• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake
2index 2254c2a7..f15724f1 100644
3--- a/cmake/package_lite.cmake
4+++ b/cmake/package_lite.cmake
5@@ -474,7 +474,7 @@ if(PLATFORM_ARM64)
6             COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
7     install(DIRECTORY ${TOP_DIR}/include/c_api/ DESTINATION ${RUNTIME_INC_DIR}/c_api
8             COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
9-    if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR MSLITE_ENABLE_CONVERTER OR TARGET_HIMIX)
10+    if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR MSLITE_ENABLE_CONVERTER OR TARGET_HIMIX OR TARGET_OHOS)
11         __install_micro_wrapper()
12     endif()
13     if(MSLITE_ENABLE_RUNTIME_GLOG)
14diff --git a/mindspore/ccsrc/backend/common/optimizer/pass.h b/mindspore/ccsrc/backend/common/optimizer/pass.h
15new file mode 100644
16index 00000000..8d396164
17--- /dev/null
18+++ b/mindspore/ccsrc/backend/common/optimizer/pass.h
19@@ -0,0 +1,48 @@
20+/**
21+ * Copyright 2023 Huawei Technologies Co., Ltd
22+ *
23+ * Licensed under the Apache License, Version 2.0 (the "License");
24+ * you may not use this file except in compliance with the License.
25+ * You may obtain a copy of the License at
26+ *
27+ * http://www.apache.org/licenses/LICENSE-2.0
28+ *
29+ * Unless required by applicable law or agreed to in writing, software
30+ * distributed under the License is distributed on an "AS IS" BASIS,
31+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32+ * See the License for the specific language governing permissions and
33+ * limitations under the License.
34+ */
35+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_COMMON_PASS_H_
36+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_COMMON_PASS_H_
37+#include <memory>
38+#include <string>
39+#include "ir/anf.h"
40+#include "mindspore/core/ops/array_ops.h"
41+#include "mindspore/core/ops/lite_ops.h"
42+#include "utils/trace_base.h"
43+
44+namespace mindspore {
45+namespace opt {
46+class CacheManager;
47+using CacheManagerPtr = std::shared_ptr<CacheManager>;
48+
49+// @brief ANF Graph level optimization base pass
50+class Pass {
51+public:
52+  explicit Pass(const std::string &name = "pass") : name_(name) {}
53+  virtual ~Pass() = default;
54+  virtual bool Run(const FuncGraphPtr &fun_graph) = 0;
55+  const std::string &name() const { return name_;}
56+  void SetCacheManager(const CacheManagerPtr &cm) { cache_manager_ = cm;}
57+  const CacheManagerPtr &GetCacheManager() const {return cache_manager_;}
58+
59+private:
60+  const std::string name_;
61+  CacheManagerPtr cache_manager_;
62+};
63+using PassPtr = std::shared_ptr<Pass>;
64+}  // namespace opt
65+}  // namespace mindspore
66+
67+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_COMMON_PASS_H_
68diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc
69index 55bbddac..378ef00c 100644
70--- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc
71+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc
72@@ -60,6 +60,8 @@ bool LstmCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec
73   hidden_size_ = kernel_ptr->get_hidden_size();
74   num_layers_ = kernel_ptr->get_num_layers();
75   has_bias_ = kernel_ptr->get_has_bias();
76+  proj_size_ = kernel_ptr->get_proj_size();
77+  real_hidden_size_ = proj_size_ > 0 ? proj_size_ : hidden_size_;
78   constexpr int kBidirectional = 2;
79   num_directions_ = 1;
80   if (bidirectional_) {
81@@ -73,14 +75,20 @@ bool LstmCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec
82     MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
83   }
84
85+  weight_size_ = 0;
86+  weight_h_size_ = 0;
87+  weight_r_size_ = 0;
88   for (int i = 0; i < num_layers_; ++i) {
89     weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
90-    weight_h_size_ += gate_size * hidden_size_;
91+    weight_h_size_ += gate_size * real_hidden_size_;
92+    weight_r_size_ += hidden_size_ * proj_size_;
93   }
94   weight_size_ = weight_size_ * num_directions_;
95   weight_h_size_ = weight_h_size_ * num_directions_;
96+  weight_r_size_ = weight_r_size_ * num_directions_;
97   weights_dims_ = {num_layers_, num_directions_, input_size_, kGateNum, hidden_size_};
98-  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, kGateNum, hidden_size_};
99+  weights_h_dims_ = {num_layers_, num_directions_, real_hidden_size_, kGateNum, hidden_size_};
100+  weights_r_dims_ = {num_layers_, num_directions_, hidden_size_, proj_size_};
101   bias_dims_ = {num_layers_, num_directions_, kGateNum, hidden_size_};
102   is_training_ =
103     base_operator->HasAttr(kAttrIsTraining) ? GetValue<bool>(base_operator->GetAttr(kAttrIsTraining)) : true;
104@@ -110,10 +118,10 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
105     direction = dnnl::rnn_direction::bidirectional_concat;
106   }
107   dim src_dims = {seq_len_, batch_size_, input_size_};
108-  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
109+  dim src_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_};
110   dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
111-  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
112-  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
113+  dim dst_dims = {seq_len_, batch_size_, real_hidden_size_ * num_directions_};
114+  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_};
115   dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
116   dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
117   dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
118@@ -126,13 +134,16 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
119   auto prop_kind = is_training_ ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_inference;
120   auto weights_desc = formatted_md(weights_dims_, tag::any);
121   auto weights_h_desc = formatted_md(weights_h_dims_, tag::any);
122-  auto desc =
123-    CreatePrimitive<dnnl::lstm_forward::desc>(prop_kind, direction, src_desc, src_h_desc, src_c_desc, weights_desc,
124-                                              weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
125+  auto weights_r_desc = proj_size_ > 0 ? formatted_md(weights_r_dims_, tag::any) : dnnl::memory::desc();
126+  auto peephole_desc = dnnl::memory::desc();
127+  auto desc = CreatePrimitive<dnnl::lstm_forward::desc>(prop_kind, direction, src_desc, src_h_desc, src_c_desc,
128+                                                        weights_desc, weights_h_desc, peephole_desc, weights_r_desc,
129+                                                        bias_desc, dst_desc, dst_h_desc, dst_c_desc);
130   prim_desc_ = CreateDesc<dnnl::lstm_forward::primitive_desc>(*desc, engine_);
131   primitive_ = CreatePrimitive<dnnl::lstm_forward>(prim_desc_);
132   auto weights_layer = GetWeightsLayerDesc(prim_desc_);
133   auto weights_iter = GetWeightsIterDesc(prim_desc_);
134+  auto weights_proj = GetWeightsProjectionDesc(prim_desc_);
135   bias_desc_ = GetBiasDesc(prim_desc_);
136   if (is_training_) {
137     auto wksp_desc = GetWorkspaceDesc(prim_desc_);
138@@ -144,6 +155,7 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
139   AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
140   AddArgument(DNNL_ARG_WEIGHTS_LAYER, weights_layer);
141   AddArgument(DNNL_ARG_WEIGHTS_ITER, weights_iter);
142+  AddArgument(DNNL_ARG_WEIGHTS_PROJECTION, weights_proj);
143   AddArgument(DNNL_ARG_BIAS, bias_desc);
144   AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
145   AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
146@@ -151,10 +163,13 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
147
148   auto weights_dims_desc = CreateDesc<dnnl::memory::desc>(weights_dims_, dt::f32, tag::ldgoi);
149   auto weights_h_dims_desc = CreateDesc<dnnl::memory::desc>(weights_h_dims_, dt::f32, tag::ldgoi);
150+  auto weights_r_dims_desc = CreateDesc<dnnl::memory::desc>(weights_r_dims_, dt::f32, tag::ldoi);
151   user_weights_memory_ = CreateDesc<dnnl::memory>(weights_dims_desc, engine_);
152   user_weights_h_memory_ = CreateDesc<dnnl::memory>(weights_h_dims_desc, engine_);
153+  user_weights_r_memory_ = CreateDesc<dnnl::memory>(weights_r_dims_desc, engine_);
154   weights_memory_ = CreateDesc<dnnl::memory>(weights_layer, engine_);
155   weights_h_memory_ = CreateDesc<dnnl::memory>(weights_iter, engine_);
156+  weights_r_memory_ = CreateDesc<dnnl::memory>(weights_proj, engine_);
157   bias_memory_ = CreateDesc<dnnl::memory>(bias_desc_, engine_);
158
159   InitOutputSize(outputs);
160@@ -163,13 +178,20 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
161
162 bool LstmCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
163                               const std::vector<kernel::AddressPtr> &outputs) {
164+  size_t offset = 0;
165   SetDataHandle(user_weights_memory_, inputs[kInputWeightIndex]->addr);
166-  SetDataHandle(user_weights_h_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + weight_size_);
167+  offset += weight_size_;
168+  SetDataHandle(user_weights_h_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + offset);
169+  offset += weight_h_size_;
170   Reorder(&user_weights_memory_, &weights_memory_);
171   Reorder(&user_weights_h_memory_, &weights_h_memory_);
172+  if (proj_size_ > 0) {
173+    SetDataHandle(user_weights_r_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + offset);
174+    Reorder(&user_weights_r_memory_, &weights_r_memory_);
175+    offset += weight_r_size_;
176+  }
177   if (has_bias_) {
178-    SetDataHandle(bias_memory_,
179-                  reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + weight_size_ + weight_h_size_);
180+    SetDataHandle(bias_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + offset);
181   } else {
182     auto size = GetSize(bias_desc_);
183     if (memset_s(GetDataHandle(bias_memory_), size, 0, size) != EOK) {
184@@ -182,6 +204,7 @@ bool LstmCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs, con
185   SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[kInputCIndex]->addr);
186   SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, GetDataHandle(weights_memory_));
187   SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, GetDataHandle(weights_h_memory_));
188+  SetArgumentHandle(DNNL_ARG_WEIGHTS_PROJECTION, GetDataHandle(weights_r_memory_));
189   SetArgumentHandle(DNNL_ARG_BIAS, GetDataHandle(bias_memory_));
190   SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr);
191   SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
192diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h
193index 42609eed..a0241c16 100644
194--- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h
195+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h
196@@ -58,14 +58,17 @@ class LstmCpuKernelMod : public MKLCpuKernelMod {
197  private:
198   void InitOutputSize(const std::vector<KernelTensorPtr> &outputs);
199
200-  int weight_size_{0};
201-  int weight_h_size_{0};
202-  int input_size_{0};
203-  int hidden_size_{0};
204-  int num_layers_{0};
205-  int batch_size_{0};
206-  int seq_len_{0};
207-  int num_directions_{0};
208+  int64_t weight_size_{0};
209+  int64_t weight_h_size_{0};
210+  int64_t weight_r_size_{0};
211+  int64_t input_size_{0};
212+  int64_t hidden_size_{0};
213+  int64_t num_layers_{0};
214+  int64_t batch_size_{0};
215+  int64_t seq_len_{0};
216+  int64_t num_directions_{0};
217+  int64_t proj_size_{0};
218+  int64_t real_hidden_size_{0};
219   bool bidirectional_{false};
220   bool has_bias_{false};
221   bool is_training_{false};
222@@ -73,13 +76,16 @@ class LstmCpuKernelMod : public MKLCpuKernelMod {
223
224   dnnl::memory::dims weights_dims_;
225   dnnl::memory::dims weights_h_dims_;
226+  dnnl::memory::dims weights_r_dims_;
227   dnnl::memory::dims bias_dims_;
228   dnnl::lstm_forward::primitive_desc prim_desc_;
229   dnnl::memory::desc bias_desc_;
230   dnnl::memory user_weights_memory_;
231   dnnl::memory user_weights_h_memory_;
232+  dnnl::memory user_weights_r_memory_;
233   dnnl::memory weights_memory_;
234   dnnl::memory weights_h_memory_;
235+  dnnl::memory weights_r_memory_;
236   dnnl::memory bias_memory_;
237 };
238 }  // namespace kernel
239diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc
240index aa1f8b44..0b5d09c1 100644
241--- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc
242+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc
243@@ -62,6 +62,8 @@ bool LSTMGradCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std:
244   hidden_size_ = op_prim->get_hidden_size();
245   num_layers_ = op_prim->get_num_layers();
246   has_bias_ = op_prim->get_has_bias();
247+  proj_size_ = op_prim->get_proj_size();
248+  real_hidden_size_ = proj_size_ > 0 ? proj_size_ : hidden_size_;
249   auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs);
250   auto match = MatchKernelAttr(kernel_attr, GetOpSupport());
251   if (!match.first) {
252@@ -103,12 +105,15 @@ int LSTMGradCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std
253   }
254   weight_size_ = 0;
255   weight_h_size_ = 0;
256+  weight_r_size_ = 0;
257   for (int64_t i = 0; i < num_layers_; ++i) {
258     weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
259-    weight_h_size_ += gate_size * hidden_size_;
260+    weight_h_size_ += gate_size * real_hidden_size_;
261+    weight_r_size_ += proj_size_ * hidden_size_;
262   }
263   weight_size_ = weight_size_ * num_directions_;
264   weight_h_size_ = weight_h_size_ * num_directions_;
265+  weight_r_size_ = weight_r_size_ * num_directions_;
266   if (num_directions_ * num_layers_ != src_h_shape[0]) {
267     MS_LOG(ERROR) << "Error iteration shape!";
268     return KRET_RESIZE_FAILED;
269@@ -124,13 +129,14 @@ void LSTMGradCpuKernelMod::InitDnnl() {
270     direction = dnnl::rnn_direction::bidirectional_concat;
271   }
272   dim src_dims = {seq_len_, batch_size_, input_size_};
273-  dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
274+  dim src_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_};
275   dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
276   weights_dims_ = {num_layers_, num_directions_, input_size_, kNumberFour, hidden_size_};
277-  weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, kNumberFour, hidden_size_};
278+  weights_h_dims_ = {num_layers_, num_directions_, real_hidden_size_, kNumberFour, hidden_size_};
279+  weights_r_dims_ = {num_layers_, num_directions_, hidden_size_, proj_size_};
280   bias_dims_ = {num_layers_, num_directions_, kNumberFour, hidden_size_};
281-  dim dst_dims = {seq_len_, batch_size_, static_cast<int64_t>(hidden_size_) * num_directions_};
282-  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
283+  dim dst_dims = {seq_len_, batch_size_, real_hidden_size_ * num_directions_};
284+  dim dst_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_};
285   dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
286   dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
287   dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
288@@ -141,15 +147,17 @@ void LSTMGradCpuKernelMod::InitDnnl() {
289   dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
290   auto weights_desc = formatted_md(weights_dims_, tag::any);
291   auto weights_h_desc = formatted_md(weights_h_dims_, tag::any);
292+  auto weights_r_desc = proj_size_ > 0 ? formatted_md(weights_r_dims_, tag::any) : dnnl::memory::desc();
293+  auto peephole_desc = dnnl::memory::desc();
294
295-  auto forward_desc = CreatePrimitive<dnnl::lstm_forward::desc>(dnnl::prop_kind::forward_training, direction, src_desc,
296-                                                                src_h_desc, src_c_desc, weights_desc, weights_h_desc,
297-                                                                bias_desc, dst_desc, dst_h_desc, dst_c_desc);
298+  auto forward_desc = CreatePrimitive<dnnl::lstm_forward::desc>(
299+    dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc,
300+    peephole_desc, weights_r_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
301   auto prim_forward_desc = CreateDesc<dnnl::lstm_forward::primitive_desc>(*forward_desc, eng);
302   auto backward_desc = CreatePrimitive<dnnl::lstm_backward::desc>(
303-    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, bias_desc,
304-    dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, bias_desc,
305-    dst_desc, dst_h_desc, dst_c_desc);
306+    dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, peephole_desc,
307+    weights_r_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, src_c_desc, weights_desc,
308+    weights_h_desc, peephole_desc, weights_r_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
309   prim_backward_desc_ = CreateDesc<dnnl::lstm_backward::primitive_desc>(*backward_desc, eng, prim_forward_desc);
310   primitive_ = CreatePrimitive<dnnl::lstm_backward>(prim_backward_desc_);
311   auto wksp_desc = GetWorkspaceDesc(prim_forward_desc);
312@@ -159,24 +167,31 @@ void LSTMGradCpuKernelMod::InitDnnl() {
313   // construct fw memory
314   weights_layer_desc_ = GetWeightsLayerDesc(prim_backward_desc_);
315   weights_iter_desc_ = GetWeightsIterDesc(prim_backward_desc_);
316+  weights_proj_desc_ = GetWeightsProjectionDesc(prim_backward_desc_);
317   bias_desc_ = GetBiasDesc(prim_backward_desc_);
318   auto weights_mem_desc = CreateDesc<dnnl::memory::desc>(weights_dims_, dt::f32, tag::ldgoi);
319   auto weights_h_mem_desc = CreateDesc<dnnl::memory::desc>(weights_h_dims_, dt::f32, tag::ldgoi);
320+  auto weights_r_mem_desc = CreateDesc<dnnl::memory::desc>(weights_r_dims_, dt::f32, tag::ldoi);
321   user_weights_memory_ = CreateDesc<dnnl::memory>(weights_mem_desc, eng);
322   user_weights_h_memory_ = CreateDesc<dnnl::memory>(weights_h_mem_desc, eng);
323+  user_weights_r_memory_ = CreateDesc<dnnl::memory>(weights_r_mem_desc, eng);
324   weights_memory_ = CreateDesc<dnnl::memory>(weights_layer_desc_, eng);
325   weights_h_memory_ = CreateDesc<dnnl::memory>(weights_iter_desc_, eng);
326+  weights_r_memory_ = CreateDesc<dnnl::memory>(weights_proj_desc_, eng);
327   bias_memory_ = CreateDesc<dnnl::memory>(bias_desc_, eng);
328
329   // construct bw memory
330   diff_weights_layer_desc_ = GetDiffWeightsLayerDesc(prim_backward_desc_);
331   diff_weights_iter_desc_ = GetDiffWeightsIterDesc(prim_backward_desc_);
332+  diff_weights_proj_desc_ = GetDiffWeightsProjectionDesc(prim_backward_desc_);
333   diff_bias_desc_ = GetDiffBiasDesc(prim_backward_desc_);
334   diff_weights_memory_ = CreateDesc<dnnl::memory>(diff_weights_layer_desc_, eng);
335   diff_weights_h_memory_ = CreateDesc<dnnl::memory>(diff_weights_iter_desc_, eng);
336+  diff_weights_r_memory_ = CreateDesc<dnnl::memory>(diff_weights_proj_desc_, eng);
337   diff_bias_memory_ = CreateDesc<dnnl::memory>(diff_bias_desc_, eng);
338   user_diff_weights_memory_ = CreateDesc<dnnl::memory>(weights_mem_desc, eng);
339   user_diff_weights_h_memory_ = CreateDesc<dnnl::memory>(weights_h_mem_desc, eng);
340+  user_diff_weights_r_memory_ = CreateDesc<dnnl::memory>(weights_r_mem_desc, eng);
341 }
342
343 void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
344@@ -188,6 +203,7 @@ void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, con
345   AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
346   AddArgument(DNNL_ARG_WEIGHTS_LAYER, weights_layer_desc_);
347   AddArgument(DNNL_ARG_WEIGHTS_ITER, weights_iter_desc_);
348+  AddArgument(DNNL_ARG_WEIGHTS_PROJECTION, weights_proj_desc_);
349   AddArgument(DNNL_ARG_BIAS, bias_desc);
350   AddArgument(DNNL_ARG_DST_LAYER, dst_desc);
351   AddArgument(DNNL_ARG_DST_ITER, dst_h_desc);
352@@ -197,6 +213,7 @@ void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, con
353   AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc);
354   AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_layer_desc_);
355   AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_iter_desc_);
356+  AddArgument(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, diff_weights_proj_desc_);
357   AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc);
358   AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
359   AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
360@@ -211,6 +228,7 @@ void LSTMGradCpuKernelMod::SetArgumentHandleOp(const std::vector<kernel::Address
361   SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[kSrcIterCIdx]->addr);
362   SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, GetDataHandle(weights_memory_));
363   SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, GetDataHandle(weights_h_memory_));
364+  SetArgumentHandle(DNNL_ARG_WEIGHTS_PROJECTION, GetDataHandle(weights_r_memory_));
365   SetArgumentHandle(DNNL_ARG_BIAS, GetDataHandle(bias_memory_));
366   SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[kDstLayerIdx]->addr);
367   SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[kDstIterIdx]->addr);
368@@ -221,6 +239,7 @@ void LSTMGradCpuKernelMod::SetArgumentHandleOp(const std::vector<kernel::Address
369   SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[kSrcIterCIdx]->addr);
370   SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, GetDataHandle(diff_weights_memory_));
371   SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, GetDataHandle(diff_weights_h_memory_));
372+  SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, GetDataHandle(diff_weights_r_memory_));
373   SetArgumentHandle(DNNL_ARG_DIFF_BIAS, GetDataHandle(diff_bias_memory_));
374   SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[kDiffDstLayerIdx]->addr);
375   SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[kDiffDstIterIdx]->addr);
376@@ -241,13 +260,20 @@ bool LSTMGradCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
377                                   const std::vector<kernel::AddressPtr> &outputs) {
378   CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLstmGradInputsNum, kernel_name_);
379   CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLstmGradOutputsNum, kernel_name_);
380+  size_t offset = 0;
381   SetDataHandle(user_weights_memory_, inputs[kInputWeightIndex]->addr);
382-  SetDataHandle(user_weights_h_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + weight_size_);
383+  offset += weight_size_;
384+  SetDataHandle(user_weights_h_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + offset);
385+  offset += weight_h_size_;
386   Reorder(&user_weights_memory_, &weights_memory_);
387   Reorder(&user_weights_h_memory_, &weights_h_memory_);
388+  if (proj_size_ > 0) {
389+    SetDataHandle(user_weights_r_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + offset);
390+    Reorder(&user_weights_r_memory_, &weights_r_memory_);
391+    offset += weight_r_size_;
392+  }
393   if (has_bias_) {
394-    SetDataHandle(bias_memory_,
395-                  reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + weight_size_ + weight_h_size_);
396+    SetDataHandle(bias_memory_, reinterpret_cast<float *>(inputs[kInputWeightIndex]->addr) + offset);
397   } else {
398     auto dst_ptr = GetDataHandle(bias_memory_);
399     auto size = GetSize(bias_desc_);
400@@ -256,16 +282,23 @@ bool LSTMGradCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
401     }
402   }
403
404+  offset = 0;
405   SetDataHandle(user_diff_weights_memory_, outputs[kOutputWeightIndex]->addr);
406-  SetDataHandle(user_diff_weights_h_memory_,
407-                reinterpret_cast<float *>(outputs[kOutputWeightIndex]->addr) + weight_size_);
408+  offset += weight_size_;
409+  SetDataHandle(user_diff_weights_h_memory_, reinterpret_cast<float *>(outputs[kOutputWeightIndex]->addr) + offset);
410+  offset += weight_h_size_;
411   ResetMemory(user_diff_weights_memory_, "user weights grad");
412   ResetMemory(user_diff_weights_h_memory_, "user weights iter grad");
413   ResetMemory(diff_weights_memory_, "weights grad");
414   ResetMemory(diff_weights_h_memory_, "weights iter grad");
415+  if (proj_size_ > 0) {
416+    SetDataHandle(user_diff_weights_r_memory_, reinterpret_cast<float *>(outputs[kOutputWeightIndex]->addr) + offset);
417+    ResetMemory(user_diff_weights_r_memory_, "user weights projection grad");
418+    ResetMemory(diff_weights_r_memory_, "weights projection grad");
419+    offset += weight_r_size_;
420+  }
421   if (has_bias_) {
422-    SetDataHandle(diff_bias_memory_,
423-                  reinterpret_cast<float *>(outputs[kOutputWeightIndex]->addr) + weight_size_ + weight_h_size_);
424+    SetDataHandle(diff_bias_memory_, reinterpret_cast<float *>(outputs[kOutputWeightIndex]->addr) + offset);
425   }
426   auto dst_ptr = GetDataHandle(diff_bias_memory_);
427   auto size = GetSize(diff_bias_desc_);
428@@ -276,6 +309,9 @@ bool LSTMGradCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs,
429   ExecutePrimitive();
430   Reorder(&diff_weights_memory_, &user_diff_weights_memory_);
431   Reorder(&diff_weights_h_memory_, &user_diff_weights_h_memory_);
432+  if (proj_size_ > 0) {
433+    Reorder(&diff_weights_r_memory_, &user_diff_weights_r_memory_);
434+  }
435   return true;
436 }
437
438diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h
439index f47bafc0..9768464d 100644
440--- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h
441+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h
442@@ -75,34 +75,44 @@ class LSTMGradCpuKernelMod : public MKLCpuKernelMod {
443   bool has_bias_{false};
444   int64_t weight_size_{0};
445   int64_t weight_h_size_{0};
446+  int64_t weight_r_size_{0};
447   int64_t input_size_{0};
448   int64_t hidden_size_{0};
449   int64_t num_layers_{0};
450   int64_t batch_size_{0};
451   int64_t seq_len_{0};
452+  int64_t proj_size_{0};
453+  int64_t real_hidden_size_{0};
454   size_t reserve_size_{0};
455
456   dnnl::memory::dims weights_dims_;
457   dnnl::memory::dims weights_h_dims_;
458+  dnnl::memory::dims weights_r_dims_;
459   dnnl::memory::dims bias_dims_;
460   dnnl::lstm_backward::primitive_desc prim_backward_desc_;
461
462   dnnl::memory::desc weights_layer_desc_;
463   dnnl::memory::desc weights_iter_desc_;
464+  dnnl::memory::desc weights_proj_desc_;
465   dnnl::memory::desc bias_desc_;
466   dnnl::memory::desc diff_weights_layer_desc_;
467   dnnl::memory::desc diff_weights_iter_desc_;
468+  dnnl::memory::desc diff_weights_proj_desc_;
469   dnnl::memory::desc diff_bias_desc_;
470   dnnl::memory user_weights_memory_;
471   dnnl::memory user_weights_h_memory_;
472+  dnnl::memory user_weights_r_memory_;
473   dnnl::memory weights_memory_;
474   dnnl::memory weights_h_memory_;
475+  dnnl::memory weights_r_memory_;
476   dnnl::memory bias_memory_;
477   dnnl::memory diff_weights_memory_;
478   dnnl::memory diff_weights_h_memory_;
479+  dnnl::memory diff_weights_r_memory_;
480   dnnl::memory diff_bias_memory_;
481   dnnl::memory user_diff_weights_memory_;
482   dnnl::memory user_diff_weights_h_memory_;
483+  dnnl::memory user_diff_weights_r_memory_;
484 };
485 }  // namespace kernel
486 }  // namespace mindspore
487diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h
488index 7c8292df..0c98f8f6 100644
489--- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h
490+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h
491@@ -89,6 +89,14 @@ auto GetWeightsIterDesc(const T &prim_desc) {
492   return desc;
493 }
494
495+template <class T>
496+auto GetWeightsProjectionDesc(const T &prim_desc) {
497+  MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::weights_projection_desc()";
498+  auto desc = prim_desc.weights_projection_desc();
499+  MS_LOG(DEBUG) << "end to invoke " << demangle(typeid(T).name()) << "::weights_projection_desc()";
500+  return desc;
501+}
502+
503 template <class T>
504 auto GetBiasDesc(const T &prim_desc) {
505   MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::bias_desc()";
506@@ -113,6 +121,14 @@ auto GetDiffWeightsIterDesc(const T &prim_desc) {
507   return desc;
508 }
509
510+template <class T>
511+auto GetDiffWeightsProjectionDesc(const T &prim_desc) {
512+  MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::diff_weights_projection_desc()";
513+  auto desc = prim_desc.diff_weights_projection_desc();
514+  MS_LOG(DEBUG) << "end to invoke " << demangle(typeid(T).name()) << "::diff_weights_projection_desc()";
515+  return desc;
516+}
517+
518 template <class T>
519 auto GetDiffBiasDesc(const T &prim_desc) {
520   MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::diff_bias_desc()";
521diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn
522index 103e53b7..d27817be 100644
523--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn
524+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn
525@@ -501,6 +501,7 @@ infer_shape_sources = [
526   "infer/custom_masked_fill_infer.c",
527   "infer/custom_is_inf_infer.c",
528   "infer/custom_tensor_scatter_max_infer.c",
529+  "infer/custom_gather_d_grad_v2_infer.c",
530   "infer/decoder_layer_infer.c",
531   "infer/deconv2d_infer.c",
532   "infer/depth_to_space_infer.c",
533@@ -740,6 +741,7 @@ arm64_fp16_assembly_sources = [
534   "assembly/fp16/Matmul12X16Fp16.S",
535   "assembly/fp16/MatmulBaseFp16Neon.S",
536   "assembly/fp16/MatmulFp16Opt.S",
537+  "assembly/fp16/MatmulFp16OptV2.S",
538   "assembly/fp16/MatmulFp16.S",
539   "assembly/fp16/MatmulWinogradFp16.S",
540   "assembly/fp16/MatVecMulFp16.S",
541diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
542new file mode 100644
543index 00000000..2d901a3d
544--- /dev/null
545+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S
546@@ -0,0 +1,2966 @@
547+/**
548+ * Copyright 2023 Huawei Technologies Co., Ltd
549+ *
550+ * Licensed under the Apache License, Version 2.0 (the "License");
551+ * you may not use this file except in compliance with the License.
552+ * You may obtain a copy of the License at
553+ *
554+ * http://www.apache.org/licenses/LICENSE-2.0
555+ *
556+ * Unless required by applicable law or agreed to in writing, software
557+ * distributed under the License is distributed on an "AS IS" BASIS,
558+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
559+ * See the License for the specific language governing permissions and
560+ * limitations under the License.
561+ */
562+#ifdef ENABLE_ARM64
563+#include "nnacl/assembly_global.h"
564+
565+.text
566+.align 5
567+
568+// void MatmulFp16OptV2(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
569+//                      size_t depth, size_t row, size_t col, size_t stride, size_t writeMode)
570+// x0: a
571+// x1: b
572+// x2: c
573+// x3: bias
574+// x4: act_type
575+// x5: depth
576+// x6: row
577+// x7: col
578+// x8: stride
579+// x9: writeMode
580+
581+asm_function MatmulFp16OptV2
582+    sub sp, sp, #192
583+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
584+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64
585+    stp x19, x20, [sp], #16
586+    stp x21, x22, [sp], #16
587+    stp x23, x24, [sp], #16
588+    stp x29, x30, [sp], #16
589+
590+    ldr x8, [sp]
591+    ldr x9, [sp, #8]  // writeMode
592+    lsl x8, x8, #1  // stride * sizeof(float16_t)
593+
594+    lsl x15, x7, #1 // col * sizeof(float16_t)
595+    lsl x16, x5, #1  // depth * sizeof(float16_t)
596+    mov x11, x2
597+    movi v7.8h, #0x46, lsl #8
598+    subs x6, x6, #12
599+    blt LoopRow8
600+LoopRow12:
601+    mov x11, x1  // reload matrixB
602+    mov x12, x3  // reload bias
603+    mov x13, x7  // reload col
604+    mov x21, x2  // relocate output
605+    subs x13, x13, #16
606+    blt LoopCol12x8
607+    LoopCol12x16:
608+        mov x10, x0  // update matrixA
609+        ld1 {v0.8h}, [x10], #16
610+        mov x14, x5  // reload depth
611+        prfm pldl1strm, [x11, #632]
612+        ld1 {v3.8h}, [x11], #16
613+        cbnz x12, InitFromBias12x16
614+        dup v8.2d, xzr
615+        dup v9.2d, xzr
616+        dup v10.2d, xzr
617+        dup v11.2d, xzr
618+        dup v12.2d, xzr
619+        dup v13.2d, xzr
620+        dup v14.2d, xzr
621+        dup v15.2d, xzr
622+        dup v16.2d, xzr
623+        dup v17.2d, xzr
624+        dup v18.2d, xzr
625+        dup v19.2d, xzr
626+        dup v20.2d, xzr
627+        dup v21.2d, xzr
628+        dup v22.2d, xzr
629+        dup v23.2d, xzr
630+        dup v24.2d, xzr
631+        dup v25.2d, xzr
632+        dup v26.2d, xzr
633+        dup v27.2d, xzr
634+        dup v28.2d, xzr
635+        dup v29.2d, xzr
636+        dup v30.2d, xzr
637+        dup v31.2d, xzr
638+        b Compute12x16Enter
639+        InitFromBias12x16:
640+            ld1 {v8.8h, v9.8h}, [x12]
641+            ld1 {v10.8h, v11.8h}, [x12]
642+            ld1 {v12.8h, v13.8h}, [x12]
643+            ld1 {v14.8h, v15.8h}, [x12]
644+            ld1 {v16.8h, v17.8h}, [x12]
645+            ld1 {v18.8h, v19.8h}, [x12]
646+            ld1 {v20.8h, v21.8h}, [x12]
647+            ld1 {v22.8h, v23.8h}, [x12]
648+            ld1 {v24.8h, v25.8h}, [x12]
649+            ld1 {v26.8h, v27.8h}, [x12]
650+            ld1 {v28.8h, v29.8h}, [x12]
651+            ld1 {v30.8h, v31.8h}, [x12]
652+            add x12, x12, #32
653+    Compute12x16Enter:
654+        bl Compute12x16Unit
655+        Activation12x16:
656+            cmp x4, #3
657+            beq Relu612x16
658+            cmp x4, #1
659+            beq Relu12x16
660+            b Write12x16
661+
662+            Relu612x16:
663+                fmin v8.8h, v8.8h, v7.8h
664+                fmin v9.8h, v9.8h, v7.8h
665+                fmin v10.8h, v10.8h, v7.8h
666+                fmin v11.8h, v11.8h, v7.8h
667+                fmin v12.8h, v12.8h, v7.8h
668+                fmin v13.8h, v13.8h, v7.8h
669+                fmin v14.8h, v14.8h, v7.8h
670+                fmin v15.8h, v15.8h, v7.8h
671+                fmin v16.8h, v16.8h, v7.8h
672+                fmin v17.8h, v17.8h, v7.8h
673+                fmin v18.8h, v18.8h, v7.8h
674+                fmin v19.8h, v19.8h, v7.8h
675+                fmin v20.8h, v20.8h, v7.8h
676+                fmin v21.8h, v21.8h, v7.8h
677+                fmin v22.8h, v22.8h, v7.8h
678+                fmin v23.8h, v23.8h, v7.8h
679+                fmin v24.8h, v24.8h, v7.8h
680+                fmin v25.8h, v25.8h, v7.8h
681+                fmin v26.8h, v26.8h, v7.8h
682+                fmin v27.8h, v27.8h, v7.8h
683+                fmin v28.8h, v28.8h, v7.8h
684+                fmin v29.8h, v29.8h, v7.8h
685+                fmin v30.8h, v30.8h, v7.8h
686+                fmin v31.8h, v31.8h, v7.8h
687+
688+            Relu12x16:
689+                dup v6.8h, wzr
690+                fmax v8.8h, v8.8h, v6.8h
691+                fmax v9.8h, v9.8h, v6.8h
692+                fmax v10.8h, v10.8h, v6.8h
693+                fmax v11.8h, v11.8h, v6.8h
694+                fmax v12.8h, v12.8h, v6.8h
695+                fmax v13.8h, v13.8h, v6.8h
696+                fmax v14.8h, v14.8h, v6.8h
697+                fmax v15.8h, v15.8h, v6.8h
698+                fmax v16.8h, v16.8h, v6.8h
699+                fmax v17.8h, v17.8h, v6.8h
700+                fmax v18.8h, v18.8h, v6.8h
701+                fmax v19.8h, v19.8h, v6.8h
702+                fmax v20.8h, v20.8h, v6.8h
703+                fmax v21.8h, v21.8h, v6.8h
704+                fmax v22.8h, v22.8h, v6.8h
705+                fmax v23.8h, v23.8h, v6.8h
706+                fmax v24.8h, v24.8h, v6.8h
707+                fmax v25.8h, v25.8h, v6.8h
708+                fmax v26.8h, v26.8h, v6.8h
709+                fmax v27.8h, v27.8h, v6.8h
710+                fmax v28.8h, v28.8h, v6.8h
711+                fmax v29.8h, v29.8h, v6.8h
712+                fmax v30.8h, v30.8h, v6.8h
713+                fmax v31.8h, v31.8h, v6.8h
714+            Write12x16:
715+                mov x22, x21
716+                add x23, x21, x8, lsl #2
717+                add x24, x21, x8, lsl #3
718+                st1 {v8.8h, v9.8h}, [x22], x8
719+                st1 {v10.8h, v11.8h}, [x22], x8
720+                st1 {v12.8h, v13.8h}, [x22], x8
721+                st1 {v14.8h, v15.8h}, [x22]
722+                st1 {v16.8h, v17.8h}, [x23], x8
723+                st1 {v18.8h, v19.8h}, [x23], x8
724+                st1 {v20.8h, v21.8h}, [x23], x8
725+                st1 {v22.8h, v23.8h}, [x23]
726+                st1 {v24.8h, v25.8h}, [x24], x8
727+                st1 {v26.8h, v27.8h}, [x24], x8
728+                st1 {v28.8h, v29.8h}, [x24], x8
729+                st1 {v30.8h, v31.8h}, [x24]
730+                add x21, x21, #32
731+                subs x13, x13, #16
732+                bge LoopCol12x16
733+
734+    LoopCol12x8:
735+        adds x13, x13, #16
736+        cbz x13, LoopRow12End
737+        subs x13, x13, #8
738+        blt LoopCol12x4
739+        mov x10, x0  // update matrixA
740+        ld1 {v0.8h}, [x10], #16
741+        mov x14, x5  // reload depth
742+        prfm pldl1strm, [x11, #632]
743+        ld1 {v3.8h}, [x11], #16
744+        cbnz x12, InitFromBias12x8
745+        dup v8.2d, xzr
746+        dup v10.2d, xzr
747+        dup v12.2d, xzr
748+        dup v14.2d, xzr
749+        dup v16.2d, xzr
750+        dup v18.2d, xzr
751+        dup v20.2d, xzr
752+        dup v22.2d, xzr
753+        dup v24.2d, xzr
754+        dup v26.2d, xzr
755+        dup v28.2d, xzr
756+        dup v30.2d, xzr
757+        b Compute12x8Enter
758+        InitFromBias12x8:
759+            ld1 {v8.8h}, [x12]
760+            ld1 {v10.8h}, [x12]
761+            ld1 {v12.8h}, [x12]
762+            ld1 {v14.8h}, [x12]
763+            ld1 {v16.8h}, [x12]
764+            ld1 {v18.8h}, [x12]
765+            ld1 {v20.8h}, [x12]
766+            ld1 {v22.8h}, [x12]
767+            ld1 {v24.8h}, [x12]
768+            ld1 {v26.8h}, [x12]
769+            ld1 {v28.8h}, [x12]
770+            ld1 {v30.8h}, [x12]
771+            add x12, x12, #16
772+    Compute12x8Enter:
773+        bl Compute12x8Unit
774+        Activation12x8:
775+            cmp x4, #3
776+            beq Relu612x8
777+            cmp x4, #1
778+            beq Relu12x8
779+            b Write12x8
780+
781+            Relu612x8:
782+                fmin v8.8h, v8.8h, v7.8h
783+                fmin v10.8h, v10.8h, v7.8h
784+                fmin v12.8h, v12.8h, v7.8h
785+                fmin v14.8h, v14.8h, v7.8h
786+                fmin v16.8h, v16.8h, v7.8h
787+                fmin v18.8h, v18.8h, v7.8h
788+                fmin v20.8h, v20.8h, v7.8h
789+                fmin v22.8h, v22.8h, v7.8h
790+                fmin v24.8h, v24.8h, v7.8h
791+                fmin v26.8h, v26.8h, v7.8h
792+                fmin v28.8h, v28.8h, v7.8h
793+                fmin v30.8h, v30.8h, v7.8h
794+
795+            Relu12x8:
796+                dup v6.8h, wzr
797+                fmax v8.8h, v8.8h, v6.8h
798+                fmax v10.8h, v10.8h, v6.8h
799+                fmax v12.8h, v12.8h, v6.8h
800+                fmax v14.8h, v14.8h, v6.8h
801+                fmax v16.8h, v16.8h, v6.8h
802+                fmax v18.8h, v18.8h, v6.8h
803+                fmax v20.8h, v20.8h, v6.8h
804+                fmax v22.8h, v22.8h, v6.8h
805+                fmax v24.8h, v24.8h, v6.8h
806+                fmax v26.8h, v26.8h, v6.8h
807+                fmax v28.8h, v28.8h, v6.8h
808+                fmax v30.8h, v30.8h, v6.8h
809+            Write12x8:
810+                mov x22, x21
811+                add x23, x21, x8, lsl #2
812+                add x24, x21, x8, lsl #3
813+                st1 {v8.8h}, [x22], x8
814+                st1 {v10.8h}, [x22], x8
815+                st1 {v12.8h}, [x22], x8
816+                st1 {v14.8h}, [x22]
817+                st1 {v16.8h}, [x23], x8
818+                st1 {v18.8h}, [x23], x8
819+                st1 {v20.8h}, [x23], x8
820+                st1 {v22.8h}, [x23]
821+                st1 {v24.8h}, [x24], x8
822+                st1 {v26.8h}, [x24], x8
823+                st1 {v28.8h}, [x24], x8
824+                st1 {v30.8h}, [x24]
825+                add x21, x21, #16
826+                subs x13, x13, #8
827+
828+    LoopCol12x4:
829+        adds x13, x13, #8
830+        cbz x13, LoopRow12End
831+    LoopCol12x4Core:
832+        mov x10, x0  // update matrixA
833+        ld1 {v0.8h}, [x10], #16
834+        mov x14, x5  // reload depth
835+        prfm pldl1strm, [x11, #632]
836+        ld1 {v3.4h}, [x11], #8
837+        cbnz x12, InitFromBias12x4
838+        dup v8.2s, wzr
839+        dup v10.2s, wzr
840+        dup v12.2s, wzr
841+        dup v14.2s, wzr
842+        dup v16.2s, wzr
843+        dup v18.2s, wzr
844+        dup v20.2s, wzr
845+        dup v22.2s, wzr
846+        dup v24.2s, wzr
847+        dup v26.2s, wzr
848+        dup v28.2s, wzr
849+        dup v30.2s, wzr
850+        b Compute12x4Enter
851+        InitFromBias12x4:
852+            ld1 {v8.4h}, [x12]
853+            ld1 {v10.4h}, [x12]
854+            ld1 {v12.4h}, [x12]
855+            ld1 {v14.4h}, [x12]
856+            ld1 {v16.4h}, [x12]
857+            ld1 {v18.4h}, [x12]
858+            ld1 {v20.4h}, [x12]
859+            ld1 {v22.4h}, [x12]
860+            ld1 {v24.4h}, [x12]
861+            ld1 {v26.4h}, [x12]
862+            ld1 {v28.4h}, [x12]
863+            ld1 {v30.4h}, [x12]
864+            add x12, x12, #8
865+    Compute12x4Enter:
866+        bl Compute12x4Unit
867+        Activation12x4:
868+            cmp x4, #3
869+            beq Relu612x4
870+            cmp x4, #1
871+            beq Relu12x4
872+            b Write12x4
873+
874+            Relu612x4:
875+                fmin v8.4h, v8.4h, v7.4h
876+                fmin v10.4h, v10.4h, v7.4h
877+                fmin v12.4h, v12.4h, v7.4h
878+                fmin v14.4h, v14.4h, v7.4h
879+                fmin v16.4h, v16.4h, v7.4h
880+                fmin v18.4h, v18.4h, v7.4h
881+                fmin v20.4h, v20.4h, v7.4h
882+                fmin v22.4h, v22.4h, v7.4h
883+                fmin v24.4h, v24.4h, v7.4h
884+                fmin v26.4h, v26.4h, v7.4h
885+                fmin v28.4h, v28.4h, v7.4h
886+                fmin v30.4h, v30.4h, v7.4h
887+
888+            Relu12x4:
889+                dup v6.4h, wzr
890+                fmax v8.4h, v8.4h, v6.4h
891+                fmax v10.4h, v10.4h, v6.4h
892+                fmax v12.4h, v12.4h, v6.4h
893+                fmax v14.4h, v14.4h, v6.4h
894+                fmax v16.4h, v16.4h, v6.4h
895+                fmax v18.4h, v18.4h, v6.4h
896+                fmax v20.4h, v20.4h, v6.4h
897+                fmax v22.4h, v22.4h, v6.4h
898+                fmax v24.4h, v24.4h, v6.4h
899+                fmax v26.4h, v26.4h, v6.4h
900+                fmax v28.4h, v28.4h, v6.4h
901+                fmax v30.4h, v30.4h, v6.4h
902+            Write12x4:
903+                mov x22, x21
904+                add x23, x21, x8, lsl #2
905+                add x24, x21, x8, lsl #3
906+                cmp x13, #1
907+                beq Write12x1
908+                cmp x13, #2
909+                beq Write12x2
910+                cmp x13, #3
911+                beq Write12x3
912+                st1 {v8.4h}, [x22], x8
913+                st1 {v10.4h}, [x22], x8
914+                st1 {v12.4h}, [x22], x8
915+                st1 {v14.4h}, [x22]
916+                st1 {v16.4h}, [x23], x8
917+                st1 {v18.4h}, [x23], x8
918+                st1 {v20.4h}, [x23], x8
919+                st1 {v22.4h}, [x23]
920+                st1 {v24.4h}, [x24], x8
921+                st1 {v26.4h}, [x24], x8
922+                st1 {v28.4h}, [x24], x8
923+                st1 {v30.4h}, [x24]
924+                add x21, x21, #8
925+                subs x13, x13, #4
926+                bgt LoopCol12x4Core
927+                b LoopRow12End
928+            Write12x1:
929+                st1 {v8.h}[0], [x22], x8
930+                st1 {v10.h}[0], [x22], x8
931+                st1 {v12.h}[0], [x22], x8
932+                st1 {v14.h}[0], [x22]
933+                st1 {v16.h}[0], [x23], x8
934+                st1 {v18.h}[0], [x23], x8
935+                st1 {v20.h}[0], [x23], x8
936+                st1 {v22.h}[0], [x23]
937+                st1 {v24.h}[0], [x24], x8
938+                st1 {v26.h}[0], [x24], x8
939+                st1 {v28.h}[0], [x24], x8
940+                st1 {v30.h}[0], [x24]
941+                b LoopRow12End
942+            Write12x2:
943+                st1 {v8.s}[0], [x22], x8
944+                st1 {v10.s}[0], [x22], x8
945+                st1 {v12.s}[0], [x22], x8
946+                st1 {v14.s}[0], [x22]
947+                st1 {v16.s}[0], [x23], x8
948+                st1 {v18.s}[0], [x23], x8
949+                st1 {v20.s}[0], [x23], x8
950+                st1 {v22.s}[0], [x23]
951+                st1 {v24.s}[0], [x24], x8
952+                st1 {v26.s}[0], [x24], x8
953+                st1 {v28.s}[0], [x24], x8
954+                st1 {v30.s}[0], [x24]
955+                b LoopRow12End
956+            Write12x3:
957+                add x23, x22, #4
958+                st1 {v8.s}[0], [x22], x8
959+                st1 {v8.h}[2], [x23], x8
960+                st1 {v10.s}[0], [x22], x8
961+                st1 {v10.h}[2], [x23], x8
962+                st1 {v12.s}[0], [x22], x8
963+                st1 {v12.h}[2], [x23], x8
964+                st1 {v14.s}[0], [x22], x8
965+                st1 {v14.h}[2], [x23], x8
966+                st1 {v16.s}[0], [x22], x8
967+                st1 {v16.h}[2], [x23], x8
968+                st1 {v18.s}[0], [x22], x8
969+                st1 {v18.h}[2], [x23], x8
970+                st1 {v20.s}[0], [x22], x8
971+                st1 {v20.h}[2], [x23], x8
972+                st1 {v22.s}[0], [x22], x8
973+                st1 {v22.h}[2], [x23], x8
974+                st1 {v24.s}[0], [x22], x8
975+                st1 {v24.h}[2], [x23], x8
976+                st1 {v26.s}[0], [x22], x8
977+                st1 {v26.h}[2], [x23], x8
978+                st1 {v28.s}[0], [x22], x8
979+                st1 {v28.h}[2], [x23], x8
980+                st1 {v30.s}[0], [x22]
981+                st1 {v30.h}[2], [x23]
982+            LoopRow12End:
983+                add x0, x0, x16, lsl #3
984+                add x0, x0, x16, lsl #2
985+                add x2, x2, x8, lsl #3
986+                add x2, x2, x8, lsl #2
987+                subs x6, x6, #12
988+                bge LoopRow12
989+
990+LoopRow8:
991+    adds x6, x6,#12
992+    cbz x6, End
993+    subs x6, x6, #8
994+    blt LoopRow4
995+    mov x11, x1  // reload matrixB
996+    mov x12, x3  // reload bias
997+    mov x13, x7  // reload col
998+    mov x21, x2  // relocate output
999+    subs x13, x13, #16
1000+    blt LoopCol8x8
1001+    LoopCol8x16:
1002+        mov x10, x0  // update matrixA
1003+        ld1 {v0.8h}, [x10], #16
1004+        mov x14, x5  // reload depth
1005+        prfm pldl1strm, [x11, #632]
1006+        ld1 {v3.8h}, [x11], #16
1007+        cbnz x12, InitFromBias8x16
1008+        dup v8.2d, xzr
1009+        dup v9.2d, xzr
1010+        dup v10.2d, xzr
1011+        dup v11.2d, xzr
1012+        dup v12.2d, xzr
1013+        dup v13.2d, xzr
1014+        dup v14.2d, xzr
1015+        dup v15.2d, xzr
1016+        dup v16.2d, xzr
1017+        dup v17.2d, xzr
1018+        dup v18.2d, xzr
1019+        dup v19.2d, xzr
1020+        dup v20.2d, xzr
1021+        dup v21.2d, xzr
1022+        dup v22.2d, xzr
1023+        dup v23.2d, xzr
1024+        b Compute8x16Enter
1025+        InitFromBias8x16:
1026+            ld1 {v8.8h, v9.8h}, [x12]
1027+            ld1 {v10.8h, v11.8h}, [x12]
1028+            ld1 {v12.8h, v13.8h}, [x12]
1029+            ld1 {v14.8h, v15.8h}, [x12]
1030+            ld1 {v16.8h, v17.8h}, [x12]
1031+            ld1 {v18.8h, v19.8h}, [x12]
1032+            ld1 {v20.8h, v21.8h}, [x12]
1033+            ld1 {v22.8h, v23.8h}, [x12]
1034+            add x12, x12, #32
1035+    Compute8x16Enter:
1036+        bl Compute8x16Unit
1037+        Activation8x16:
1038+            cmp x4, #3
1039+            beq Relu68x16
1040+            cmp x4, #1
1041+            beq Relu8x16
1042+            b Write8x16
1043+
1044+            Relu68x16:
1045+                fmin v8.8h, v8.8h, v7.8h
1046+                fmin v9.8h, v9.8h, v7.8h
1047+                fmin v10.8h, v10.8h, v7.8h
1048+                fmin v11.8h, v11.8h, v7.8h
1049+                fmin v12.8h, v12.8h, v7.8h
1050+                fmin v13.8h, v13.8h, v7.8h
1051+                fmin v14.8h, v14.8h, v7.8h
1052+                fmin v15.8h, v15.8h, v7.8h
1053+                fmin v16.8h, v16.8h, v7.8h
1054+                fmin v17.8h, v17.8h, v7.8h
1055+                fmin v18.8h, v18.8h, v7.8h
1056+                fmin v19.8h, v19.8h, v7.8h
1057+                fmin v20.8h, v20.8h, v7.8h
1058+                fmin v21.8h, v21.8h, v7.8h
1059+                fmin v22.8h, v22.8h, v7.8h
1060+                fmin v23.8h, v23.8h, v7.8h
1061+
1062+            Relu8x16:
1063+                dup v6.8h, wzr
1064+                fmax v8.8h, v8.8h, v6.8h
1065+                fmax v9.8h, v9.8h, v6.8h
1066+                fmax v10.8h, v10.8h, v6.8h
1067+                fmax v11.8h, v11.8h, v6.8h
1068+                fmax v12.8h, v12.8h, v6.8h
1069+                fmax v13.8h, v13.8h, v6.8h
1070+                fmax v14.8h, v14.8h, v6.8h
1071+                fmax v15.8h, v15.8h, v6.8h
1072+                fmax v16.8h, v16.8h, v6.8h
1073+                fmax v17.8h, v17.8h, v6.8h
1074+                fmax v18.8h, v18.8h, v6.8h
1075+                fmax v19.8h, v19.8h, v6.8h
1076+                fmax v20.8h, v20.8h, v6.8h
1077+                fmax v21.8h, v21.8h, v6.8h
1078+                fmax v22.8h, v22.8h, v6.8h
1079+                fmax v23.8h, v23.8h, v6.8h
1080+            Write8x16:
1081+                mov x22, x21
1082+                add x23, x21, x8, lsl #2
1083+                st1 {v8.8h, v9.8h}, [x22], x8
1084+                st1 {v10.8h, v11.8h}, [x22], x8
1085+                st1 {v12.8h, v13.8h}, [x22], x8
1086+                st1 {v14.8h, v15.8h}, [x22]
1087+                st1 {v16.8h, v17.8h}, [x23], x8
1088+                st1 {v18.8h, v19.8h}, [x23], x8
1089+                st1 {v20.8h, v21.8h}, [x23], x8
1090+                st1 {v22.8h, v23.8h}, [x23]
1091+                add x21, x21, #32
1092+                subs x13, x13, #16
1093+                bge LoopCol8x16
1094+
1095+    LoopCol8x8:
1096+        adds x13, x13, #16
1097+        cbz x13, LoopRow8End
1098+        subs x13, x13, #8
1099+        blt LoopCol8x4
1100+        mov x10, x0  // update matrixA
1101+        ld1 {v0.8h}, [x10], #16
1102+        mov x14, x5  // reload depth
1103+        prfm pldl1strm, [x11, #632]
1104+        ld1 {v3.8h}, [x11], #16
1105+        cbnz x12, InitFromBias8x8
1106+        dup v8.2d, xzr
1107+        dup v10.2d, xzr
1108+        dup v12.2d, xzr
1109+        dup v14.2d, xzr
1110+        dup v16.2d, xzr
1111+        dup v18.2d, xzr
1112+        dup v20.2d, xzr
1113+        dup v22.2d, xzr
1114+        b Compute8x8Enter
1115+        InitFromBias8x8:
1116+            ld1 {v8.8h}, [x12]
1117+            ld1 {v10.8h}, [x12]
1118+            ld1 {v12.8h}, [x12]
1119+            ld1 {v14.8h}, [x12]
1120+            ld1 {v16.8h}, [x12]
1121+            ld1 {v18.8h}, [x12]
1122+            ld1 {v20.8h}, [x12]
1123+            ld1 {v22.8h}, [x12]
1124+            add x12, x12, #16
1125+    Compute8x8Enter:
1126+        bl Compute8x8Unit
1127+        Activation8x8:
1128+            cmp x4, #3
1129+            beq Relu68x8
1130+            cmp x4, #1
1131+            beq Relu8x8
1132+            b Write8x8
1133+
1134+            Relu68x8:
1135+                fmin v8.8h, v8.8h, v7.8h
1136+                fmin v10.8h, v10.8h, v7.8h
1137+                fmin v12.8h, v12.8h, v7.8h
1138+                fmin v14.8h, v14.8h, v7.8h
1139+                fmin v16.8h, v16.8h, v7.8h
1140+                fmin v18.8h, v18.8h, v7.8h
1141+                fmin v20.8h, v20.8h, v7.8h
1142+                fmin v22.8h, v22.8h, v7.8h
1143+
1144+            Relu8x8:
1145+                dup v6.8h, wzr
1146+                fmax v8.8h, v8.8h, v6.8h
1147+                fmax v10.8h, v10.8h, v6.8h
1148+                fmax v12.8h, v12.8h, v6.8h
1149+                fmax v14.8h, v14.8h, v6.8h
1150+                fmax v16.8h, v16.8h, v6.8h
1151+                fmax v18.8h, v18.8h, v6.8h
1152+                fmax v20.8h, v20.8h, v6.8h
1153+                fmax v22.8h, v22.8h, v6.8h
1154+            Write8x8:
1155+                mov x22, x21
1156+                add x23, x21, x8, lsl #2
1157+                st1 {v8.8h}, [x22], x8
1158+                st1 {v10.8h}, [x22], x8
1159+                st1 {v12.8h}, [x22], x8
1160+                st1 {v14.8h}, [x22]
1161+                st1 {v16.8h}, [x23], x8
1162+                st1 {v18.8h}, [x23], x8
1163+                st1 {v20.8h}, [x23], x8
1164+                st1 {v22.8h}, [x23]
1165+                add x21, x21, #16
1166+                subs x13, x13, #8
1167+
1168+    LoopCol8x4:
1169+        adds x13, x13, #8
1170+        cbz x13, LoopRow8End
1171+    LoopCol8x4Core:
1172+        mov x10, x0  // update matrixA
1173+        ld1 {v0.8h}, [x10], #16
1174+        mov x14, x5  // reload depth
1175+        prfm pldl1strm, [x11, #632]
1176+        ld1 {v3.4h}, [x11], #8
1177+        cbnz x12, InitFromBias8x4
1178+        dup v8.2s, wzr
1179+        dup v10.2s, wzr
1180+        dup v12.2s, wzr
1181+        dup v14.2s, wzr
1182+        dup v16.2s, wzr
1183+        dup v18.2s, wzr
1184+        dup v20.2s, wzr
1185+        dup v22.2s, wzr
1186+        b Compute8x4Enter
1187+        InitFromBias8x4:
1188+            ld1 {v8.4h}, [x12]
1189+            ld1 {v10.4h}, [x12]
1190+            ld1 {v12.4h}, [x12]
1191+            ld1 {v14.4h}, [x12]
1192+            ld1 {v16.4h}, [x12]
1193+            ld1 {v18.4h}, [x12]
1194+            ld1 {v20.4h}, [x12]
1195+            ld1 {v22.4h}, [x12]
1196+            add x12, x12, #8
1197+    Compute8x4Enter:
1198+        bl Compute8x4Unit
1199+        Activation8x4:
1200+            cmp x4, #3
1201+            beq Relu68x4
1202+            cmp x4, #1
1203+            beq Relu8x4
1204+            b Write8x4
1205+
1206+            Relu68x4:
1207+                fmin v8.4h, v8.4h, v7.4h
1208+                fmin v10.4h, v10.4h, v7.4h
1209+                fmin v12.4h, v12.4h, v7.4h
1210+                fmin v14.4h, v14.4h, v7.4h
1211+                fmin v16.4h, v16.4h, v7.4h
1212+                fmin v18.4h, v18.4h, v7.4h
1213+                fmin v20.4h, v20.4h, v7.4h
1214+                fmin v22.4h, v22.4h, v7.4h
1215+
1216+            Relu8x4:
1217+                dup v6.4h, wzr
1218+                fmax v8.4h, v8.4h, v6.4h
1219+                fmax v10.4h, v10.4h, v6.4h
1220+                fmax v12.4h, v12.4h, v6.4h
1221+                fmax v14.4h, v14.4h, v6.4h
1222+                fmax v16.4h, v16.4h, v6.4h
1223+                fmax v18.4h, v18.4h, v6.4h
1224+                fmax v20.4h, v20.4h, v6.4h
1225+                fmax v22.4h, v22.4h, v6.4h
1226+            Write8x4:
1227+                mov x22, x21
1228+                add x23, x21, x8, lsl #2
1229+                cmp x13, #1
1230+                beq Write8x1
1231+                cmp x13, #2
1232+                beq Write8x2
1233+                cmp x13, #3
1234+                beq Write8x3
1235+                st1 {v8.4h}, [x22], x8
1236+                st1 {v10.4h}, [x22], x8
1237+                st1 {v12.4h}, [x22], x8
1238+                st1 {v14.4h}, [x22]
1239+                st1 {v16.4h}, [x23], x8
1240+                st1 {v18.4h}, [x23], x8
1241+                st1 {v20.4h}, [x23], x8
1242+                st1 {v22.4h}, [x23]
1243+                add x21, x21, #8
1244+                subs x13, x13, #4
1245+                bgt LoopCol8x4Core
1246+                b LoopRow8End
1247+            Write8x1:
1248+                st1 {v8.h}[0], [x22], x8
1249+                st1 {v10.h}[0], [x22], x8
1250+                st1 {v12.h}[0], [x22], x8
1251+                st1 {v14.h}[0], [x22]
1252+                st1 {v16.h}[0], [x23], x8
1253+                st1 {v18.h}[0], [x23], x8
1254+                st1 {v20.h}[0], [x23], x8
1255+                st1 {v22.h}[0], [x23]
1256+                b LoopRow8End
1257+            Write8x2:
1258+                st1 {v8.s}[0], [x22], x8
1259+                st1 {v10.s}[0], [x22], x8
1260+                st1 {v12.s}[0], [x22], x8
1261+                st1 {v14.s}[0], [x22]
1262+                st1 {v16.s}[0], [x23], x8
1263+                st1 {v18.s}[0], [x23], x8
1264+                st1 {v20.s}[0], [x23], x8
1265+                st1 {v22.s}[0], [x23]
1266+                b LoopRow8End
1267+            Write8x3:
1268+                add x23, x22, #4
1269+                st1 {v8.s}[0], [x22], x8
1270+                st1 {v8.h}[2], [x23], x8
1271+                st1 {v10.s}[0], [x22], x8
1272+                st1 {v10.h}[2], [x23], x8
1273+                st1 {v12.s}[0], [x22], x8
1274+                st1 {v12.h}[2], [x23], x8
1275+                st1 {v14.s}[0], [x22], x8
1276+                st1 {v14.h}[2], [x23], x8
1277+                st1 {v16.s}[0], [x22], x8
1278+                st1 {v16.h}[2], [x23], x8
1279+                st1 {v18.s}[0], [x22], x8
1280+                st1 {v18.h}[2], [x23], x8
1281+                st1 {v20.s}[0], [x22], x8
1282+                st1 {v20.h}[2], [x23], x8
1283+                st1 {v22.s}[0], [x22], x8
1284+                st1 {v22.h}[2], [x23], x8
1285+            LoopRow8End:
1286+                add x0, x0, x16, lsl #3
1287+                add x2, x2, x8, lsl #3
1288+                subs x6, x6, #8
1289+
1290+LoopRow4:
1291+    adds x6, x6, #8
1292+    cbz x6, End
1293+    subs x6, x6, #4
1294+    blt LoopRowTail
1295+    mov x11, x1  // reload matrixB
1296+    mov x12, x3  // reload bias
1297+    mov x13, x7  // reload col
1298+    mov x21, x2  // relocate output
1299+    subs x13, x13, #16
1300+    blt LoopCol4x8
1301+    LoopCol4x16:
1302+        mov x10, x0  // update matrixA
1303+        ld1 {v0.4h}, [x10], #8
1304+        mov x14, x5  // reload depth
1305+        prfm pldl1strm, [x11, #632]
1306+        ld1 {v3.8h}, [x11], #16
1307+        cbnz x12, InitFromBias4x16
1308+        dup v8.2d, xzr
1309+        dup v9.2d, xzr
1310+        dup v10.2d, xzr
1311+        dup v11.2d, xzr
1312+        dup v12.2d, xzr
1313+        dup v13.2d, xzr
1314+        dup v14.2d, xzr
1315+        dup v15.2d, xzr
1316+        b Compute4x16Enter
1317+        InitFromBias4x16:
1318+            ld1 {v8.8h, v9.8h}, [x12]
1319+            ld1 {v10.8h, v11.8h}, [x12]
1320+            ld1 {v12.8h, v13.8h}, [x12]
1321+            ld1 {v14.8h, v15.8h}, [x12]
1322+            add x12, x12, #32
1323+    Compute4x16Enter:
1324+        bl Compute4x16Unit
1325+        Activation4x16:
1326+            cmp x4, #3
1327+            beq Relu64x16
1328+            cmp x4, #1
1329+            beq Relu4x16
1330+            b Write4x16
1331+
1332+            Relu64x16:
1333+                fmin v8.8h, v8.8h, v7.8h
1334+                fmin v9.8h, v9.8h, v7.8h
1335+                fmin v10.8h, v10.8h, v7.8h
1336+                fmin v11.8h, v11.8h, v7.8h
1337+                fmin v12.8h, v12.8h, v7.8h
1338+                fmin v13.8h, v13.8h, v7.8h
1339+                fmin v14.8h, v14.8h, v7.8h
1340+                fmin v15.8h, v15.8h, v7.8h
1341+
1342+            Relu4x16:
1343+                dup v6.8h, wzr
1344+                fmax v8.8h, v8.8h, v6.8h
1345+                fmax v9.8h, v9.8h, v6.8h
1346+                fmax v10.8h, v10.8h, v6.8h
1347+                fmax v11.8h, v11.8h, v6.8h
1348+                fmax v12.8h, v12.8h, v6.8h
1349+                fmax v13.8h, v13.8h, v6.8h
1350+                fmax v14.8h, v14.8h, v6.8h
1351+                fmax v15.8h, v15.8h, v6.8h
1352+            Write4x16:
1353+                mov x22, x21
1354+                st1 {v8.8h, v9.8h}, [x22], x8
1355+                st1 {v10.8h, v11.8h}, [x22], x8
1356+                st1 {v12.8h, v13.8h}, [x22], x8
1357+                st1 {v14.8h, v15.8h}, [x22]
1358+                add x21, x21, #32
1359+                subs x13, x13, #16
1360+                bge LoopCol4x16
1361+
1362+    LoopCol4x8:
1363+        adds x13, x13, #16
1364+        cbz x13, LoopRow4End
1365+        subs x13, x13, #8
1366+        blt LoopCol4x4
1367+        mov x10, x0  // update matrixA
1368+        ld1 {v0.4h}, [x10], #8
1369+        mov x14, x5  // reload depth
1370+        prfm pldl1strm, [x11, #632]
1371+        ld1 {v3.8h}, [x11], #16
1372+        cbnz x12, InitFromBias4x8
1373+        dup v8.2d, xzr
1374+        dup v10.2d, xzr
1375+        dup v12.2d, xzr
1376+        dup v14.2d, xzr
1377+        b Compute4x8Enter
1378+        InitFromBias4x8:
1379+            ld1 {v8.8h}, [x12]
1380+            ld1 {v10.8h}, [x12]
1381+            ld1 {v12.8h}, [x12]
1382+            ld1 {v14.8h}, [x12]
1383+            add x12, x12, #16
1384+    Compute4x8Enter:
1385+        bl Compute4x8Unit
1386+        Activation4x8:
1387+            cmp x4, #3
1388+            beq Relu64x8
1389+            cmp x4, #1
1390+            beq Relu4x8
1391+            b Write4x8
1392+
1393+            Relu64x8:
1394+                fmin v8.8h, v8.8h, v7.8h
1395+                fmin v10.8h, v10.8h, v7.8h
1396+                fmin v12.8h, v12.8h, v7.8h
1397+                fmin v14.8h, v14.8h, v7.8h
1398+
1399+            Relu4x8:
1400+                dup v6.8h, wzr
1401+                fmax v8.8h, v8.8h, v6.8h
1402+                fmax v10.8h, v10.8h, v6.8h
1403+                fmax v12.8h, v12.8h, v6.8h
1404+                fmax v14.8h, v14.8h, v6.8h
1405+            Write4x8:
1406+                mov x22, x21
1407+                st1 {v8.8h}, [x22], x8
1408+                st1 {v10.8h}, [x22], x8
1409+                st1 {v12.8h}, [x22], x8
1410+                st1 {v14.8h}, [x22]
1411+                add x21, x21, #16
1412+                subs x13, x13, #8
1413+
1414+    LoopCol4x4:
1415+        adds x13, x13, #8
1416+        cbz x13, LoopRow4End
1417+    LoopCol4x4Core:
1418+        mov x10, x0  // update matrixA
1419+        ld1 {v0.4h}, [x10], #8
1420+        mov x14, x5  // reload depth
1421+        prfm pldl1strm, [x11, #632]
1422+        ld1 {v3.4h}, [x11], #8
1423+        cbnz x12, InitFromBias4x4
1424+        dup v8.2s, wzr
1425+        dup v10.2s, wzr
1426+        dup v12.2s, wzr
1427+        dup v14.2s, wzr
1428+        b Compute4x4Enter
1429+        InitFromBias4x4:
1430+            ld1 {v8.4h}, [x12]
1431+            ld1 {v10.4h}, [x12]
1432+            ld1 {v12.4h}, [x12]
1433+            ld1 {v14.4h}, [x12]
1434+            add x12, x12, #8
1435+    Compute4x4Enter:
1436+        bl Compute4x4Unit
1437+        Activation4x4:
1438+            cmp x4, #3
1439+            beq Relu64x4
1440+            cmp x4, #1
1441+            beq Relu4x4
1442+            b Write4x4
1443+
1444+            Relu64x4:
1445+                fmin v8.4h, v8.4h, v7.4h
1446+                fmin v10.4h, v10.4h, v7.4h
1447+                fmin v12.4h, v12.4h, v7.4h
1448+                fmin v14.4h, v14.4h, v7.4h
1449+
1450+            Relu4x4:
1451+                dup v6.4h, wzr
1452+                fmax v8.4h, v8.4h, v6.4h
1453+                fmax v10.4h, v10.4h, v6.4h
1454+                fmax v12.4h, v12.4h, v6.4h
1455+                fmax v14.4h, v14.4h, v6.4h
1456+            Write4x4:
1457+                mov x22, x21
1458+                cmp x13, #1
1459+                beq Write4x1
1460+                cmp x13, #2
1461+                beq Write4x2
1462+                cmp x13, #3
1463+                beq Write4x3
1464+                st1 {v8.4h}, [x22], x8
1465+                st1 {v10.4h}, [x22], x8
1466+                st1 {v12.4h}, [x22], x8
1467+                st1 {v14.4h}, [x22]
1468+                add x21, x21, #8
1469+                subs x13, x13, #4
1470+                bgt LoopCol4x4Core
1471+                b LoopRow4End
1472+            Write4x1:
1473+                st1 {v8.h}[0], [x22], x8
1474+                st1 {v10.h}[0], [x22], x8
1475+                st1 {v12.h}[0], [x22], x8
1476+                st1 {v14.h}[0], [x22]
1477+                b LoopRow4End
1478+            Write4x2:
1479+                st1 {v8.s}[0], [x22], x8
1480+                st1 {v10.s}[0], [x22], x8
1481+                st1 {v12.s}[0], [x22], x8
1482+                st1 {v14.s}[0], [x22]
1483+                b LoopRow4End
1484+            Write4x3:
1485+                add x23, x22, #4
1486+                st1 {v8.s}[0], [x22], x8
1487+                st1 {v8.h}[2], [x23], x8
1488+                st1 {v10.s}[0], [x22], x8
1489+                st1 {v10.h}[2], [x23], x8
1490+                st1 {v12.s}[0], [x22], x8
1491+                st1 {v12.h}[2], [x23], x8
1492+                st1 {v14.s}[0], [x22], x8
1493+                st1 {v14.h}[2], [x23], x8
1494+            LoopRow4End:
1495+                add x0, x0, x16, lsl #2
1496+                add x2, x2, x8, lsl #2
1497+                subs x6, x6, #4
1498+
// -----------------------------------------------------------------------------
// LoopRowTail: dispatch for the last 1-3 output rows of the fp16 matmul.
// On entry x6 = remaining_rows - 4 (<= 0 from the preceding subs); the adds
// restores the true remainder.  rows==0 -> done, rows==1/2 -> LoopRow1/2,
// rows==3 falls through and is handled inline below.
// Register roles (mirroring the loops above):
//   x0 = matrixA base, x1 = matrixB base, x2 = output base, x3 = bias ptr
//   (0 when no bias), x4 = activation type (1 = Relu, 3 = Relu6), x5 = depth,
//   x7 = col count, x8 = output row stride in bytes,
//   x10/x11 = current A/B pointers, x13 = column counter, x14 = depth counter,
//   x21/x22/x23 = output write pointers.
//   v7 is the Relu6 clamp bound (presumably 6.0 fp16 broadcast, set up before
//   this view — TODO confirm against the function prologue).
// Columns are consumed in tiles of 16, then 8, then a 1-4 element tail.
1499+LoopRowTail:
1500+    adds x6, x6, #4
1501+    cbz x6, End
1502+    cmp x6, #1
1503+    beq LoopRow1
1504+    cmp x6, #2
1505+    beq LoopRow2
1506+    // LoopRow3
1507+    mov x11, x1  // reload matrixB
1508+    mov x12, x3  // reload bias
1509+    mov x13, x7  // reload col
1510+    mov x21, x2  // relocate output
1511+    subs x13, x13, #16
1512+    blt LoopCol3x8
// 3x16 tile: accumulators v8..v13 — row r uses v(8+2r) for columns 0-7 and
// v(9+2r) for columns 8-15.
1513+    LoopCol3x16:
1514+        mov x10, x0  // update matrixA
1515+        mov x14, x5  // reload depth
1516+        cbnz x12, InitFromBias3x16
// no bias: zero the accumulators
1517+        dup v8.2d, xzr
1518+        dup v9.2d, xzr
1519+        dup v10.2d, xzr
1520+        dup v11.2d, xzr
1521+        dup v12.2d, xzr
1522+        dup v13.2d, xzr
1523+        b Compute3x16Enter
// bias is per-column, so every row loads the same 16 values; x12 is only
// advanced once past the 16 halves (32 bytes) afterwards.
1524+        InitFromBias3x16:
1525+            ld1 {v8.8h, v9.8h}, [x12]
1526+            ld1 {v10.8h, v11.8h}, [x12]
1527+            ld1 {v12.8h, v13.8h}, [x12]
1528+            add x12, x12, #32
1529+    Compute3x16Enter:
1530+        bl Compute3x16Unit
// activation: x4 == 3 -> Relu6 (clamp to [0, v7]), x4 == 1 -> Relu, else none.
// Relu6 falls through into Relu so the max-with-zero applies in both cases.
1531+        Activation3x16:
1532+            cmp x4, #3
1533+            beq Relu63x16
1534+            cmp x4, #1
1535+            beq Relu3x16
1536+            b Write3x16
1537+
1538+            Relu63x16:
1539+                fmin v8.8h, v8.8h, v7.8h
1540+                fmin v9.8h, v9.8h, v7.8h
1541+                fmin v10.8h, v10.8h, v7.8h
1542+                fmin v11.8h, v11.8h, v7.8h
1543+                fmin v12.8h, v12.8h, v7.8h
1544+                fmin v13.8h, v13.8h, v7.8h
1545+
1546+            Relu3x16:
1547+                dup v6.8h, wzr
1548+                fmax v8.8h, v8.8h, v6.8h
1549+                fmax v9.8h, v9.8h, v6.8h
1550+                fmax v10.8h, v10.8h, v6.8h
1551+                fmax v11.8h, v11.8h, v6.8h
1552+                fmax v12.8h, v12.8h, v6.8h
1553+                fmax v13.8h, v13.8h, v6.8h
// store one 16-wide row per stride step; advance the column cursor by
// 16 halves (32 bytes) in the output.
1554+            Write3x16:
1555+                mov x22, x21
1556+                st1 {v8.8h, v9.8h}, [x22], x8
1557+                st1 {v10.8h, v11.8h}, [x22], x8
1558+                st1 {v12.8h, v13.8h}, [x22]
1559+                add x21, x21, #32
1560+                subs x13, x13, #16
1561+                bge LoopCol3x16
1562+
// 3x8 tile: one 8h accumulator per row (v8, v10, v12).
1563+    LoopCol3x8:
1564+        adds x13, x13, #16
1565+        cbz x13, End
1566+        subs x13, x13, #8
1567+        blt LoopCol3x4
1568+        mov x10, x0  // update matrixA
1569+        mov x14, x5  // reload depth
1570+        cbnz x12, InitFromBias3x8
1571+        dup v8.2d, xzr
1572+        dup v10.2d, xzr
1573+        dup v12.2d, xzr
1574+        b Compute3x8Enter
1575+        InitFromBias3x8:
1576+            ld1 {v8.8h}, [x12]
1577+            ld1 {v10.8h}, [x12]
1578+            ld1 {v12.8h}, [x12]
1579+            add x12, x12, #16
1580+    Compute3x8Enter:
1581+        bl Compute3x8Unit
1582+        Activation3x8:
1583+            cmp x4, #3
1584+            beq Relu63x8
1585+            cmp x4, #1
1586+            beq Relu3x8
1587+            b Write3x8
1588+
1589+            Relu63x8:
1590+                fmin v8.8h, v8.8h, v7.8h
1591+                fmin v10.8h, v10.8h, v7.8h
1592+                fmin v12.8h, v12.8h, v7.8h
1593+
1594+            Relu3x8:
1595+                dup v6.8h, wzr
1596+                fmax v8.8h, v8.8h, v6.8h
1597+                fmax v10.8h, v10.8h, v6.8h
1598+                fmax v12.8h, v12.8h, v6.8h
1599+            Write3x8:
1600+                mov x22, x21
1601+                st1 {v8.8h}, [x22], x8
1602+                st1 {v10.8h}, [x22], x8
1603+                st1 {v12.8h}, [x22]
1604+                add x21, x21, #16
1605+                subs x13, x13, #8
1606+
// 1-4 column tail: 4h accumulators; Write3x1/2/3 emit partial vectors
// (h[0] = 1 half, s[0] = 2 halves, s[0]+h[2] = 3 halves).
1607+    LoopCol3x4:
1608+        adds x13, x13, #8
1609+        cbz x13, End
1610+    LoopCol3x4Core:
1611+        mov x10, x0  // update matrixA
1612+        mov x14, x5  // reload depth
1613+        cbnz x12, InitFromBias3x4
1614+        dup v8.2s, wzr
1615+        dup v10.2s, wzr
1616+        dup v12.2s, wzr
1617+        b Compute3x4Enter
1618+        InitFromBias3x4:
1619+            ld1 {v8.4h}, [x12]
1620+            ld1 {v10.4h}, [x12]
1621+            ld1 {v12.4h}, [x12]
1622+            add x12, x12, #8
1623+    Compute3x4Enter:
1624+        bl Compute3x4Unit
1625+        Activation3x4:
1626+            cmp x4, #3
1627+            beq Relu63x4
1628+            cmp x4, #1
1629+            beq Relu3x4
1630+            b Write3x4
1631+
1632+            Relu63x4:
1633+                fmin v8.4h, v8.4h, v7.4h
1634+                fmin v10.4h, v10.4h, v7.4h
1635+                fmin v12.4h, v12.4h, v7.4h
1636+
1637+            Relu3x4:
1638+                dup v6.4h, wzr
1639+                fmax v8.4h, v8.4h, v6.4h
1640+                fmax v10.4h, v10.4h, v6.4h
1641+                fmax v12.4h, v12.4h, v6.4h
1642+            Write3x4:
1643+                mov x22, x21
1644+                cmp x13, #1
1645+                beq Write3x1
1646+                cmp x13, #2
1647+                beq Write3x2
1648+                cmp x13, #3
1649+                beq Write3x3
// full 4-wide store; loop again if more than 4 columns remained
1650+                st1 {v8.4h}, [x22], x8
1651+                st1 {v10.4h}, [x22], x8
1652+                st1 {v12.4h}, [x22]
1653+                add x21, x21, #8
1654+                subs x13, x13, #4
1655+                bgt LoopCol3x4Core
1656+                b End
1657+            Write3x1:
1658+                st1 {v8.h}[0], [x22], x8
1659+                st1 {v10.h}[0], [x22], x8
1660+                st1 {v12.h}[0], [x22]
1661+                b End
1662+            Write3x2:
1663+                st1 {v8.s}[0], [x22], x8
1664+                st1 {v10.s}[0], [x22], x8
1665+                st1 {v12.s}[0], [x22]
1666+                b End
// 3 columns: store halves 0-1 via s[0] at x22 and half 2 via h[2] at
// x23 = x22 + 4 bytes, both walking down rows by the output stride.
1667+            Write3x3:
1668+                add x23, x22, #4
1669+                st1 {v8.s}[0], [x22], x8
1670+                st1 {v8.h}[2], [x23], x8
1671+                st1 {v10.s}[0], [x22], x8
1672+                st1 {v10.h}[2], [x23], x8
1673+                st1 {v12.s}[0], [x22], x8
1674+                st1 {v12.h}[2], [x23], x8
1675+                b End
1676+
// -----------------------------------------------------------------------------
// LoopRow2: emit the final 2 output rows.  Same structure as the row-3 path:
// 16-wide tiles (accumulators v8/v9 and v10/v11), then 8-wide (v8, v10),
// then a 1-4 column tail with partial-vector stores.  Register roles are
// identical to LoopRowTail above (x11/x12/x13/x21 are re-derived from the
// x1/x3/x7/x2 bases here).
1677+LoopRow2:
1678+    mov x11, x1  // reload matrixB
1679+    mov x12, x3  // reload bias
1680+    mov x13, x7  // reload col
1681+    mov x21, x2  // relocate output
1682+    subs x13, x13, #16
1683+    blt LoopCol2x8
1684+    LoopCol2x16:
1685+        mov x10, x0  // update matrixA
1686+        mov x14, x5  // reload depth
1687+        cbnz x12, InitFromBias2x16
1688+        dup v8.2d, xzr
1689+        dup v9.2d, xzr
1690+        dup v10.2d, xzr
1691+        dup v11.2d, xzr
1692+        b Compute2x16Enter
// bias is per-column: both rows load the same 16 values, then x12 advances
// once by 32 bytes.
1693+        InitFromBias2x16:
1694+            ld1 {v8.8h, v9.8h}, [x12]
1695+            ld1 {v10.8h, v11.8h}, [x12]
1696+            add x12, x12, #32
1697+    Compute2x16Enter:
1698+        bl Compute2x16Unit
// x4 == 3 -> Relu6 (falls through into the Relu max), x4 == 1 -> Relu.
1699+        Activation2x16:
1700+            cmp x4, #3
1701+            beq Relu62x16
1702+            cmp x4, #1
1703+            beq Relu2x16
1704+            b Write2x16
1705+
1706+            Relu62x16:
1707+                fmin v8.8h, v8.8h, v7.8h
1708+                fmin v9.8h, v9.8h, v7.8h
1709+                fmin v10.8h, v10.8h, v7.8h
1710+                fmin v11.8h, v11.8h, v7.8h
1711+
1712+            Relu2x16:
1713+                dup v6.8h, wzr
1714+                fmax v8.8h, v8.8h, v6.8h
1715+                fmax v9.8h, v9.8h, v6.8h
1716+                fmax v10.8h, v10.8h, v6.8h
1717+                fmax v11.8h, v11.8h, v6.8h
1718+            Write2x16:
1719+                mov x22, x21
1720+                st1 {v8.8h, v9.8h}, [x22], x8
1721+                st1 {v10.8h, v11.8h}, [x22]
1722+                add x21, x21, #32
1723+                subs x13, x13, #16
1724+                bge LoopCol2x16
1725+
1726+    LoopCol2x8:
1727+        adds x13, x13, #16
1728+        cbz x13, End
1729+        subs x13, x13, #8
1730+        blt LoopCol2x4
1731+        mov x10, x0  // update matrixA
1732+        mov x14, x5  // reload depth
1733+        cbnz x12, InitFromBias2x8
1734+        dup v8.2d, xzr
1735+        dup v10.2d, xzr
1736+        b Compute2x8Enter
1737+        InitFromBias2x8:
1738+            ld1 {v8.8h}, [x12]
1739+            ld1 {v10.8h}, [x12]
1740+            add x12, x12, #16
1741+    Compute2x8Enter:
1742+        bl Compute2x8Unit
1743+        Activation2x8:
1744+            cmp x4, #3
1745+            beq Relu62x8
1746+            cmp x4, #1
1747+            beq Relu2x8
1748+            b Write2x8
1749+
1750+            Relu62x8:
1751+                fmin v8.8h, v8.8h, v7.8h
1752+                fmin v10.8h, v10.8h, v7.8h
1753+
1754+            Relu2x8:
1755+                dup v6.8h, wzr
1756+                fmax v8.8h, v8.8h, v6.8h
1757+                fmax v10.8h, v10.8h, v6.8h
1758+            Write2x8:
1759+                mov x22, x21
1760+                st1 {v8.8h}, [x22], x8
1761+                st1 {v10.8h}, [x22]
1762+                add x21, x21, #16
1763+                subs x13, x13, #8
1764+
// 1-4 column tail for 2 rows; Write2x1/2/3 store 1/2/3 halves per row.
1765+    LoopCol2x4:
1766+        adds x13, x13, #8
1767+        cbz x13, End
1768+    LoopCol2x4Core:
1769+        mov x10, x0  // update matrixA
1770+        mov x14, x5  // reload depth
1771+        cbnz x12, InitFromBias2x4
1772+        dup v8.2s, wzr
1773+        dup v10.2s, wzr
1774+        b Compute2x4Enter
1775+        InitFromBias2x4:
1776+            ld1 {v8.4h}, [x12]
1777+            ld1 {v10.4h}, [x12]
1778+            add x12, x12, #8
1779+    Compute2x4Enter:
1780+        bl Compute2x4Unit
1781+        Activation2x4:
1782+            cmp x4, #3
1783+            beq Relu62x4
1784+            cmp x4, #1
1785+            beq Relu2x4
1786+            b Write2x4
1787+
1788+            Relu62x4:
1789+                fmin v8.4h, v8.4h, v7.4h
1790+                fmin v10.4h, v10.4h, v7.4h
1791+            Relu2x4:
1792+                dup v6.4h, wzr
1793+                fmax v8.4h, v8.4h, v6.4h
1794+                fmax v10.4h, v10.4h, v6.4h
1795+            Write2x4:
1796+                mov x22, x21
1797+                cmp x13, #1
1798+                beq Write2x1
1799+                cmp x13, #2
1800+                beq Write2x2
1801+                cmp x13, #3
1802+                beq Write2x3
1803+                st1 {v8.4h}, [x22], x8
1804+                st1 {v10.4h}, [x22]
1805+                add x21, x21, #8
1806+                subs x13, x13, #4
1807+                bgt LoopCol2x4Core
1808+                b End
1809+            Write2x1:
1810+                st1 {v8.h}[0], [x22], x8
1811+                st1 {v10.h}[0], [x22]
1812+                b End
1813+            Write2x2:
1814+                st1 {v8.s}[0], [x22], x8
1815+                st1 {v10.s}[0], [x22]
1816+                b End
// 3 columns: halves 0-1 via s[0] at x22, half 2 via h[2] at x23 = x22 + 4.
1817+            Write2x3:
1818+                add x23, x22, #4
1819+                st1 {v8.s}[0], [x22], x8
1820+                st1 {v8.h}[2], [x23], x8
1821+                st1 {v10.s}[0], [x22], x8
1822+                st1 {v10.h}[2], [x23], x8
1823+                b End
1824+
// -----------------------------------------------------------------------------
// LoopRow1: emit the final single output row.  Only one accumulator set is
// needed (v8/v9 for 16-wide, v8 for 8- and 4-wide), so bias loads and output
// stores can use post-increment addressing directly instead of the
// reload-then-advance pattern of the multi-row paths.  Register roles match
// LoopRowTail above.
1825+LoopRow1:
1826+    mov x11, x1  // reload matrixB
1827+    mov x12, x3  // reload bias
1828+    mov x13, x7  // reload col
1829+    mov x21, x2  // relocate output
1830+    subs x13, x13, #16
1831+    blt LoopCol1x8
1832+    LoopCol1x16:
1833+        mov x10, x0  // update matrixA
1834+        mov x14, x5  // reload depth
1835+        cbnz x12, InitFromBias1x16
1836+        dup v8.2d, xzr
1837+        dup v9.2d, xzr
1838+        b Compute1x16Enter
1839+        InitFromBias1x16:
1840+            ld1 {v8.8h, v9.8h}, [x12], #32
1841+    Compute1x16Enter:
1842+        bl Compute1x16Unit
// x4 == 3 -> Relu6 (falls through into the Relu max), x4 == 1 -> Relu.
1843+        Activation1x16:
1844+            cmp x4, #3
1845+            beq Relu61x16
1846+            cmp x4, #1
1847+            beq Relu1x16
1848+            b Write1x16
1849+
1850+            Relu61x16:
1851+                fmin v8.8h, v8.8h, v7.8h
1852+                fmin v9.8h, v9.8h, v7.8h
1853+
1854+            Relu1x16:
1855+                dup v6.8h, wzr
1856+                fmax v8.8h, v8.8h, v6.8h
1857+                fmax v9.8h, v9.8h, v6.8h
1858+            Write1x16:
1859+                st1 {v8.8h, v9.8h}, [x21], #32
1860+                subs x13, x13, #16
1861+                bge LoopCol1x16
1862+
1863+    LoopCol1x8:
1864+        adds x13, x13, #16
1865+        cbz x13, End
1866+        subs x13, x13, #8
1867+        blt LoopCol1x4
1868+        mov x10, x0  // update matrixA
1869+        mov x14, x5  // reload depth
1870+        cbnz x12, InitFromBias1x8
1871+        dup v8.2d, xzr
1872+        b Compute1x8Enter
1873+        InitFromBias1x8:
1874+            ld1 {v8.8h}, [x12], #16
1875+    Compute1x8Enter:
1876+        bl Compute1x8Unit
1877+        Activation1x8:
1878+            cmp x4, #3
1879+            beq Relu61x8
1880+            cmp x4, #1
1881+            beq Relu1x8
1882+            b Write1x8
1883+
1884+            Relu61x8:
1885+                fmin v8.8h, v8.8h, v7.8h
1886+
1887+            Relu1x8:
1888+                dup v6.8h, wzr
1889+                fmax v8.8h, v8.8h, v6.8h
1890+            Write1x8:
1891+                st1 {v8.8h}, [x21], #16
1892+                subs x13, x13, #8
1893+
// 1-4 column tail; Write1x1/2/3 store 1/2/3 halves of v8.
1894+    LoopCol1x4:
1895+        adds x13, x13, #8
1896+        cbz x13, End
1897+    LoopCol1x4Core:
1898+        mov x10, x0  // update matrixA
1899+        mov x14, x5  // reload depth
1900+        cbnz x12, InitFromBias1x4
1901+        dup v8.2s, wzr
1902+        b Compute1x4Enter
1903+        InitFromBias1x4:
1904+            ld1 {v8.4h}, [x12], #8
1905+    Compute1x4Enter:
1906+        bl Compute1x4Unit
1907+        Activation1x4:
1908+            cmp x4, #3
1909+            beq Relu61x4
1910+            cmp x4, #1
1911+            beq Relu1x4
1912+            b Write1x4
1913+
1914+            Relu61x4:
1915+                fmin v8.4h, v8.4h, v7.4h
1916+            Relu1x4:
1917+                dup v6.4h, wzr
1918+                fmax v8.4h, v8.4h, v6.4h
1919+            Write1x4:
1920+                cmp x13, #1
1921+                beq Write1x1
1922+                cmp x13, #2
1923+                beq Write1x2
1924+                cmp x13, #3
1925+                beq Write1x3
1926+                st1 {v8.4h}, [x21], #8
1927+                subs x13, x13, #4
1928+                bgt LoopCol1x4Core
1929+                b End
1930+            Write1x1:
1931+                st1 {v8.h}[0], [x21]
1932+                b End
1933+            Write1x2:
1934+                st1 {v8.s}[0], [x21]
1935+                b End
// 3 columns: halves 0-1 via s[0], half 2 via h[2] at x22 = x21 + 4.
1936+            Write1x3:
1937+                add x22, x21, #4
1938+                st1 {v8.s}[0], [x21]
1939+                st1 {v8.h}[2], [x22]
1940+                b End
1941+
// -----------------------------------------------------------------------------
// Compute12x16Unit: depth-accumulation subroutine (bl/ret) for a 12-row x
// 16-column fp16 tile.  Accumulators v8..v31: row r uses v(8+2r) for columns
// 0-7 and v(9+2r) for columns 8-15.
// x10 = A pointer, x11 = B pointer, x14 = depth counter.
// NOTE(review): v0 (first 8 A values) and v3 (first 8 B values) appear to be
// preloaded by the caller, matching the pattern of the visible 4-row loops —
// confirm against the 12-row caller above this view.
// The main loop is software-pipelined and consumes TWO depth steps per
// iteration (24 A halves, 32 B halves), prefetching the next A/B cache lines.
1942+Compute12x16Unit:
1943+    subs x14, x14, #2
1944+    ble Compute12x16End
1945+    Compute12x16:
1946+        prfm pldl1keep, [x10, #632]
// v1/v2 = A rows 8-23 (covering both depth steps); v4/v5 = rest of B step 0
// and first half of B step 1
1947+        ld1 {v1.8h, v2.8h}, [x10], #32
1948+        ld1 {v4.8h, v5.8h}, [x11], #32
// depth step 0, columns 0-7 (B vector v3 x A lanes v0/v1)
1949+        fmla v8.8h, v3.8h, v0.h[0]
1950+        fmla v10.8h, v3.8h, v0.h[1]
1951+        fmla v12.8h, v3.8h, v0.h[2]
1952+        fmla v14.8h, v3.8h, v0.h[3]
1953+        fmla v16.8h, v3.8h, v0.h[4]
1954+        fmla v18.8h, v3.8h, v0.h[5]
1955+        fmla v20.8h, v3.8h, v0.h[6]
1956+        fmla v22.8h, v3.8h, v0.h[7]
1957+        fmla v24.8h, v3.8h, v1.h[0]
1958+        fmla v26.8h, v3.8h, v1.h[1]
1959+        fmla v28.8h, v3.8h, v1.h[2]
1960+        fmla v30.8h, v3.8h, v1.h[3]
1961+        prfm pldl1strm, [x11, #632]
1962+        ld1 {v6.8h}, [x11], #16
// depth step 0, columns 8-15 (B vector v4)
1963+        fmla v9.8h, v4.8h, v0.h[0]
1964+        fmla v11.8h, v4.8h, v0.h[1]
1965+        fmla v13.8h, v4.8h, v0.h[2]
1966+        fmla v15.8h, v4.8h, v0.h[3]
1967+        fmla v17.8h, v4.8h, v0.h[4]
1968+        fmla v19.8h, v4.8h, v0.h[5]
1969+        fmla v21.8h, v4.8h, v0.h[6]
1970+        fmla v23.8h, v4.8h, v0.h[7]
1971+        fmla v25.8h, v4.8h, v1.h[0]
1972+        fmla v27.8h, v4.8h, v1.h[1]
1973+        fmla v29.8h, v4.8h, v1.h[2]
1974+        fmla v31.8h, v4.8h, v1.h[3]
1975+
// depth step 1, columns 0-7 (B vector v5, A lanes v1.h[4..7]/v2)
1976+        fmla v8.8h, v5.8h, v1.h[4]
1977+        fmla v10.8h, v5.8h, v1.h[5]
1978+        fmla v12.8h, v5.8h, v1.h[6]
1979+        fmla v14.8h, v5.8h, v1.h[7]
1980+        fmla v16.8h, v5.8h, v2.h[0]
1981+        fmla v18.8h, v5.8h, v2.h[1]
1982+        fmla v20.8h, v5.8h, v2.h[2]
1983+        fmla v22.8h, v5.8h, v2.h[3]
1984+        fmla v24.8h, v5.8h, v2.h[4]
1985+        fmla v26.8h, v5.8h, v2.h[5]
1986+        fmla v28.8h, v5.8h, v2.h[6]
1987+        fmla v30.8h, v5.8h, v2.h[7]
1988+        prfm pldl1strm, [x11, #632]
// preload v3/v0 for the NEXT iteration (or the tail)
1989+        ld1 {v3.8h}, [x11], #16
// depth step 1, columns 8-15 (B vector v6)
1990+        fmla v9.8h, v6.8h, v1.h[4]
1991+        fmla v11.8h, v6.8h, v1.h[5]
1992+        fmla v13.8h, v6.8h, v1.h[6]
1993+        fmla v15.8h, v6.8h, v1.h[7]
1994+        prfm pldl1keep, [x10, #632]
1995+        ld1 {v0.8h}, [x10], #16
1996+        fmla v17.8h, v6.8h, v2.h[0]
1997+        fmla v19.8h, v6.8h, v2.h[1]
1998+        fmla v21.8h, v6.8h, v2.h[2]
1999+        fmla v23.8h, v6.8h, v2.h[3]
2000+        fmla v25.8h, v6.8h, v2.h[4]
2001+        fmla v27.8h, v6.8h, v2.h[5]
2002+        fmla v29.8h, v6.8h, v2.h[6]
2003+        fmla v31.8h, v6.8h, v2.h[7]
2004+
2005+        subs x14, x14, #2
2006+        bgt Compute12x16
// depth tail: x14 == 0 -> two steps remain (do one here, last in End1);
// x14 != 0 (negative) -> only the final step in End1.
2007+    Compute12x16End:
2008+        cbnz x14, Compute12x16End1
2009+        prfm pldl1keep, [x10, #632]
2010+        ld1 {v1.4h}, [x10], #8
2011+        ld1 {v4.8h}, [x11], #16
2012+        fmla v8.8h, v3.8h, v0.h[0]
2013+        fmla v10.8h, v3.8h, v0.h[1]
2014+        fmla v12.8h, v3.8h, v0.h[2]
2015+        fmla v14.8h, v3.8h, v0.h[3]
2016+        fmla v16.8h, v3.8h, v0.h[4]
2017+        fmla v18.8h, v3.8h, v0.h[5]
2018+        fmla v20.8h, v3.8h, v0.h[6]
2019+        fmla v22.8h, v3.8h, v0.h[7]
2020+        fmla v24.8h, v3.8h, v1.h[0]
2021+        fmla v26.8h, v3.8h, v1.h[1]
2022+        fmla v28.8h, v3.8h, v1.h[2]
2023+        fmla v30.8h, v3.8h, v1.h[3]
2024+        prfm pldl1strm, [x11, #632]
2025+        ld1 {v3.8h}, [x11], #16
2026+        fmla v9.8h, v4.8h, v0.h[0]
2027+        fmla v11.8h, v4.8h, v0.h[1]
2028+        fmla v13.8h, v4.8h, v0.h[2]
2029+        fmla v15.8h, v4.8h, v0.h[3]
2030+        ld1 {v2.8h}, [x10], #16
2031+        fmla v17.8h, v4.8h, v0.h[4]
2032+        fmla v19.8h, v4.8h, v0.h[5]
2033+        fmla v21.8h, v4.8h, v0.h[6]
2034+        fmla v23.8h, v4.8h, v0.h[7]
2035+        fmla v25.8h, v4.8h, v1.h[0]
2036+        fmla v27.8h, v4.8h, v1.h[1]
2037+        fmla v29.8h, v4.8h, v1.h[2]
2038+        fmla v31.8h, v4.8h, v1.h[3]
// hand the freshly-loaded A values to the final-step code below
2039+        mov v0.16b, v2.16b
2040+    Compute12x16End1:
2041+        ld1 {v1.4h}, [x10]
2042+        ld1 {v4.8h}, [x11], #16
2043+        fmla v8.8h, v3.8h, v0.h[0]
2044+        fmla v10.8h, v3.8h, v0.h[1]
2045+        fmla v12.8h, v3.8h, v0.h[2]
2046+        fmla v14.8h, v3.8h, v0.h[3]
2047+        fmla v16.8h, v3.8h, v0.h[4]
2048+        fmla v18.8h, v3.8h, v0.h[5]
2049+        fmla v20.8h, v3.8h, v0.h[6]
2050+        fmla v22.8h, v3.8h, v0.h[7]
2051+        fmla v24.8h, v3.8h, v1.h[0]
2052+        fmla v26.8h, v3.8h, v1.h[1]
2053+        fmla v28.8h, v3.8h, v1.h[2]
2054+        fmla v30.8h, v3.8h, v1.h[3]
2055+        fmla v9.8h, v4.8h, v0.h[0]
2056+        fmla v11.8h, v4.8h, v0.h[1]
2057+        fmla v13.8h, v4.8h, v0.h[2]
2058+        fmla v15.8h, v4.8h, v0.h[3]
2059+        fmla v17.8h, v4.8h, v0.h[4]
2060+        fmla v19.8h, v4.8h, v0.h[5]
2061+        fmla v21.8h, v4.8h, v0.h[6]
2062+        fmla v23.8h, v4.8h, v0.h[7]
2063+        fmla v25.8h, v4.8h, v1.h[0]
2064+        fmla v27.8h, v4.8h, v1.h[1]
2065+        fmla v29.8h, v4.8h, v1.h[2]
2066+        fmla v31.8h, v4.8h, v1.h[3]
2067+        ret
2068+
// -----------------------------------------------------------------------------
// Compute12x8Unit: depth accumulation for a 12-row x 8-column fp16 tile.
// Accumulators v8,v10,...,v30 (one 8h vector per row).  x10 = A, x11 = B,
// x14 = depth counter.  Two depth steps per loop iteration; tail handles the
// last one or two steps (x14 == 0 -> two remain, nonzero -> one).
// NOTE(review): expects v0/v3 preloaded by the caller (inferred from the
// sibling loops in this file — confirm against the 12-row caller).
2069+Compute12x8Unit:
2070+    subs x14, x14, #2
2071+    ble Compute12x8End
2072+    Compute12x8:
2073+        prfm pldl1keep, [x10, #632]
// v1/v2 = remaining A lanes for both depth steps; v4 = B for step 1
2074+        ld1 {v1.8h, v2.8h}, [x10], #32
2075+        ld1 {v4.8h}, [x11], #16
// depth step 0: B = v3, A lanes from v0.h[0..7] and v1.h[0..3]
2076+        fmla v8.8h, v3.8h, v0.h[0]
2077+        fmla v10.8h, v3.8h, v0.h[1]
2078+        fmla v12.8h, v3.8h, v0.h[2]
2079+        fmla v14.8h, v3.8h, v0.h[3]
2080+        fmla v16.8h, v3.8h, v0.h[4]
2081+        fmla v18.8h, v3.8h, v0.h[5]
2082+        fmla v20.8h, v3.8h, v0.h[6]
2083+        fmla v22.8h, v3.8h, v0.h[7]
2084+        fmla v24.8h, v3.8h, v1.h[0]
2085+        fmla v26.8h, v3.8h, v1.h[1]
2086+        fmla v28.8h, v3.8h, v1.h[2]
2087+        fmla v30.8h, v3.8h, v1.h[3]
2088+        prfm pldl1strm, [x11, #632]
// preload B (v3) for the next iteration / tail
2089+        ld1 {v3.8h}, [x11], #16
// depth step 1: B = v4, A lanes from v1.h[4..7] and v2.h[0..7]
2090+        fmla v8.8h, v4.8h, v1.h[4]
2091+        fmla v10.8h, v4.8h, v1.h[5]
2092+        fmla v12.8h, v4.8h, v1.h[6]
2093+        fmla v14.8h, v4.8h, v1.h[7]
// preload A (v0) for the next iteration / tail
2094+        ld1 {v0.8h}, [x10], #16
2095+        fmla v16.8h, v4.8h, v2.h[0]
2096+        fmla v18.8h, v4.8h, v2.h[1]
2097+        fmla v20.8h, v4.8h, v2.h[2]
2098+        fmla v22.8h, v4.8h, v2.h[3]
2099+        fmla v24.8h, v4.8h, v2.h[4]
2100+        fmla v26.8h, v4.8h, v2.h[5]
2101+        fmla v28.8h, v4.8h, v2.h[6]
2102+        fmla v30.8h, v4.8h, v2.h[7]
2103+
2104+        subs x14, x14, #2
2105+        bgt Compute12x8
// depth tail
2106+    Compute12x8End:
2107+        cbnz x14, Compute12x8End1
2108+        prfm pldl1keep, [x10, #632]
2109+        ld1 {v1.4h}, [x10], #8
2110+        ld1 {v4.8h}, [x11], #16
2111+        fmla v8.8h, v3.8h, v0.h[0]
2112+        fmla v10.8h, v3.8h, v0.h[1]
2113+        fmla v12.8h, v3.8h, v0.h[2]
2114+        fmla v14.8h, v3.8h, v0.h[3]
2115+        fmla v16.8h, v3.8h, v0.h[4]
2116+        fmla v18.8h, v3.8h, v0.h[5]
2117+        fmla v20.8h, v3.8h, v0.h[6]
2118+        fmla v22.8h, v3.8h, v0.h[7]
2119+        fmla v24.8h, v3.8h, v1.h[0]
2120+        fmla v26.8h, v3.8h, v1.h[1]
2121+        fmla v28.8h, v3.8h, v1.h[2]
2122+        fmla v30.8h, v3.8h, v1.h[3]
// roll the freshly-loaded A/B into v0/v3 for the final step
2123+        ld1 {v0.8h}, [x10], #16
2124+        mov v3.16b, v4.16b
2125+    Compute12x8End1:
2126+        ld1 {v1.4h}, [x10]
2127+        fmla v8.8h, v3.8h, v0.h[0]
2128+        fmla v10.8h, v3.8h, v0.h[1]
2129+        fmla v12.8h, v3.8h, v0.h[2]
2130+        fmla v14.8h, v3.8h, v0.h[3]
2131+        fmla v16.8h, v3.8h, v0.h[4]
2132+        fmla v18.8h, v3.8h, v0.h[5]
2133+        fmla v20.8h, v3.8h, v0.h[6]
2134+        fmla v22.8h, v3.8h, v0.h[7]
2135+        fmla v24.8h, v3.8h, v1.h[0]
2136+        fmla v26.8h, v3.8h, v1.h[1]
2137+        fmla v28.8h, v3.8h, v1.h[2]
2138+        fmla v30.8h, v3.8h, v1.h[3]
2139+        ret
2140+
// -----------------------------------------------------------------------------
// Compute12x4Unit: depth accumulation for a 12-row x 4-column fp16 tile.
// Same structure as Compute12x8Unit but with 4h (64-bit) accumulators
// v8,v10,...,v30 and 4-half B loads.  x10 = A, x11 = B, x14 = depth counter;
// two depth steps per loop iteration, tail handles the last one or two
// (x14 == 0 -> two remain, nonzero -> one).
// NOTE(review): expects v0/v3 preloaded by the caller (inferred from the
// sibling loops in this file — confirm against the 12-row caller).
2141+Compute12x4Unit:
2142+    subs x14, x14, #2
2143+    ble Compute12x4End
2144+    Compute12x4:
2145+        prfm pldl1keep, [x10, #632]
// v1/v2 = remaining A lanes for both depth steps; v4 = B for step 1
2146+        ld1 {v1.8h, v2.8h}, [x10], #32
2147+        ld1 {v4.4h}, [x11], #8
// depth step 0: B = v3
2148+        fmla v8.4h, v3.4h, v0.h[0]
2149+        fmla v10.4h, v3.4h, v0.h[1]
2150+        fmla v12.4h, v3.4h, v0.h[2]
2151+        fmla v14.4h, v3.4h, v0.h[3]
2152+        fmla v16.4h, v3.4h, v0.h[4]
2153+        fmla v18.4h, v3.4h, v0.h[5]
2154+        fmla v20.4h, v3.4h, v0.h[6]
2155+        fmla v22.4h, v3.4h, v0.h[7]
2156+        fmla v24.4h, v3.4h, v1.h[0]
2157+        fmla v26.4h, v3.4h, v1.h[1]
2158+        fmla v28.4h, v3.4h, v1.h[2]
2159+        fmla v30.4h, v3.4h, v1.h[3]
2160+        prfm pldl1strm, [x11, #632]
// preload B (v3) for the next iteration / tail
2161+        ld1 {v3.4h}, [x11], #8
// depth step 1: B = v4
2162+        fmla v8.4h, v4.4h, v1.h[4]
2163+        fmla v10.4h, v4.4h, v1.h[5]
2164+        fmla v12.4h, v4.4h, v1.h[6]
2165+        fmla v14.4h, v4.4h, v1.h[7]
// preload A (v0) for the next iteration / tail
2166+        ld1 {v0.8h}, [x10], #16
2167+        fmla v16.4h, v4.4h, v2.h[0]
2168+        fmla v18.4h, v4.4h, v2.h[1]
2169+        fmla v20.4h, v4.4h, v2.h[2]
2170+        fmla v22.4h, v4.4h, v2.h[3]
2171+        fmla v24.4h, v4.4h, v2.h[4]
2172+        fmla v26.4h, v4.4h, v2.h[5]
2173+        fmla v28.4h, v4.4h, v2.h[6]
2174+        fmla v30.4h, v4.4h, v2.h[7]
2175+
2176+        subs x14, x14, #2
2177+        bgt Compute12x4
// depth tail
2178+    Compute12x4End:
2179+        cbnz x14, Compute12x4End1
2180+        prfm pldl1keep, [x10, #632]
2181+        ld1 {v1.4h}, [x10], #8
2182+        ld1 {v4.4h}, [x11], #8
2183+        fmla v8.4h, v3.4h, v0.h[0]
2184+        fmla v10.4h, v3.4h, v0.h[1]
2185+        fmla v12.4h, v3.4h, v0.h[2]
2186+        fmla v14.4h, v3.4h, v0.h[3]
2187+        fmla v16.4h, v3.4h, v0.h[4]
2188+        fmla v18.4h, v3.4h, v0.h[5]
2189+        fmla v20.4h, v3.4h, v0.h[6]
2190+        fmla v22.4h, v3.4h, v0.h[7]
2191+        fmla v24.4h, v3.4h, v1.h[0]
2192+        fmla v26.4h, v3.4h, v1.h[1]
2193+        fmla v28.4h, v3.4h, v1.h[2]
2194+        fmla v30.4h, v3.4h, v1.h[3]
// roll the freshly-loaded A/B into v0/v3 for the final step
2195+        ld1 {v0.8h}, [x10], #16
2196+        mov v3.8b, v4.8b
2197+    Compute12x4End1:
2198+        ld1 {v1.4h}, [x10]
2199+        fmla v8.4h, v3.4h, v0.h[0]
2200+        fmla v10.4h, v3.4h, v0.h[1]
2201+        fmla v12.4h, v3.4h, v0.h[2]
2202+        fmla v14.4h, v3.4h, v0.h[3]
2203+        fmla v16.4h, v3.4h, v0.h[4]
2204+        fmla v18.4h, v3.4h, v0.h[5]
2205+        fmla v20.4h, v3.4h, v0.h[6]
2206+        fmla v22.4h, v3.4h, v0.h[7]
2207+        fmla v24.4h, v3.4h, v1.h[0]
2208+        fmla v26.4h, v3.4h, v1.h[1]
2209+        fmla v28.4h, v3.4h, v1.h[2]
2210+        fmla v30.4h, v3.4h, v1.h[3]
2211+        ret
2212+
// -----------------------------------------------------------------------------
// Compute8x16Unit: depth accumulation for an 8-row x 16-column fp16 tile.
// Accumulators v8..v23: row r uses v(8+2r) for columns 0-7 and v(9+2r) for
// columns 8-15.  x10 = A, x11 = B, x14 = depth counter.  Two depth steps per
// loop iteration (8 A halves each); tail handles the last one or two steps
// (x14 == 0 -> two remain, nonzero -> one).
// NOTE(review): expects v0 (A) and v3 (first 8h of B) preloaded by the
// caller, matching the pattern of the visible loops — confirm upstream.
2213+Compute8x16Unit:
2214+    subs x14, x14, #2
2215+    ble Compute8x16End
2216+    Compute8x16:
2217+        prfm pldl1keep, [x10, #632]
// v1 = A for depth step 1; v4 = B cols 8-15 step 0, v5 = B cols 0-7 step 1
2218+        ld1 {v1.8h}, [x10], #16
2219+        ld1 {v4.8h, v5.8h}, [x11], #32
// depth step 0, columns 0-7 (B = v3)
2220+        fmla v8.8h, v3.8h, v0.h[0]
2221+        fmla v10.8h, v3.8h, v0.h[1]
2222+        fmla v12.8h, v3.8h, v0.h[2]
2223+        fmla v14.8h, v3.8h, v0.h[3]
2224+        fmla v16.8h, v3.8h, v0.h[4]
2225+        fmla v18.8h, v3.8h, v0.h[5]
2226+        fmla v20.8h, v3.8h, v0.h[6]
2227+        fmla v22.8h, v3.8h, v0.h[7]
2228+        prfm pldl1strm, [x11, #632]
// v6 = B cols 8-15 for depth step 1
2229+        ld1 {v6.8h}, [x11], #16
// depth step 0, columns 8-15 (B = v4)
2230+        fmla v9.8h, v4.8h, v0.h[0]
2231+        fmla v11.8h, v4.8h, v0.h[1]
2232+        fmla v13.8h, v4.8h, v0.h[2]
2233+        fmla v15.8h, v4.8h, v0.h[3]
2234+        fmla v17.8h, v4.8h, v0.h[4]
2235+        fmla v19.8h, v4.8h, v0.h[5]
2236+        fmla v21.8h, v4.8h, v0.h[6]
2237+        fmla v23.8h, v4.8h, v0.h[7]
2238+
// depth step 1, columns 0-7 (B = v5, A = v1)
2239+        fmla v8.8h, v5.8h, v1.h[0]
2240+        fmla v10.8h, v5.8h, v1.h[1]
2241+        fmla v12.8h, v5.8h, v1.h[2]
2242+        fmla v14.8h, v5.8h, v1.h[3]
2243+        fmla v16.8h, v5.8h, v1.h[4]
2244+        fmla v18.8h, v5.8h, v1.h[5]
2245+        fmla v20.8h, v5.8h, v1.h[6]
2246+        fmla v22.8h, v5.8h, v1.h[7]
2247+        prfm pldl1strm, [x11, #632]
// preload B (v3) for the next iteration / tail
2248+        ld1 {v3.8h}, [x11], #16
// depth step 1, columns 8-15 (B = v6)
2249+        fmla v9.8h, v6.8h, v1.h[0]
2250+        fmla v11.8h, v6.8h, v1.h[1]
2251+        fmla v13.8h, v6.8h, v1.h[2]
2252+        fmla v15.8h, v6.8h, v1.h[3]
2253+        prfm pldl1keep, [x10, #632]
// preload A (v0) for the next iteration / tail
2254+        ld1 {v0.8h}, [x10], #16
2255+        fmla v17.8h, v6.8h, v1.h[4]
2256+        fmla v19.8h, v6.8h, v1.h[5]
2257+        fmla v21.8h, v6.8h, v1.h[6]
2258+        fmla v23.8h, v6.8h, v1.h[7]
2259+
2260+        subs x14, x14, #2
2261+        bgt Compute8x16
// depth tail
2262+    Compute8x16End:
2263+        cbnz x14, Compute8x16End1
2264+        prfm pldl1keep, [x10, #632]
2265+        ld1 {v1.8h}, [x10]
2266+        ld1 {v4.8h}, [x11], #16
2267+        fmla v8.8h, v3.8h, v0.h[0]
2268+        fmla v10.8h, v3.8h, v0.h[1]
2269+        fmla v12.8h, v3.8h, v0.h[2]
2270+        fmla v14.8h, v3.8h, v0.h[3]
2271+        fmla v16.8h, v3.8h, v0.h[4]
2272+        fmla v18.8h, v3.8h, v0.h[5]
2273+        fmla v20.8h, v3.8h, v0.h[6]
2274+        fmla v22.8h, v3.8h, v0.h[7]
2275+        prfm pldl1strm, [x11, #632]
2276+        ld1 {v3.8h}, [x11], #16
2277+        fmla v9.8h, v4.8h, v0.h[0]
2278+        fmla v11.8h, v4.8h, v0.h[1]
2279+        fmla v13.8h, v4.8h, v0.h[2]
2280+        fmla v15.8h, v4.8h, v0.h[3]
2281+        fmla v17.8h, v4.8h, v0.h[4]
2282+        fmla v19.8h, v4.8h, v0.h[5]
2283+        fmla v21.8h, v4.8h, v0.h[6]
2284+        fmla v23.8h, v4.8h, v0.h[7]
// hand the freshly-loaded A values to the final-step code below
2285+        mov v0.16b, v1.16b
2286+    Compute8x16End1:
2287+        ld1 {v4.8h}, [x11], #16
2288+        fmla v8.8h, v3.8h, v0.h[0]
2289+        fmla v10.8h, v3.8h, v0.h[1]
2290+        fmla v12.8h, v3.8h, v0.h[2]
2291+        fmla v14.8h, v3.8h, v0.h[3]
2292+        fmla v16.8h, v3.8h, v0.h[4]
2293+        fmla v18.8h, v3.8h, v0.h[5]
2294+        fmla v20.8h, v3.8h, v0.h[6]
2295+        fmla v22.8h, v3.8h, v0.h[7]
2296+        fmla v9.8h, v4.8h, v0.h[0]
2297+        fmla v11.8h, v4.8h, v0.h[1]
2298+        fmla v13.8h, v4.8h, v0.h[2]
2299+        fmla v15.8h, v4.8h, v0.h[3]
2300+        fmla v17.8h, v4.8h, v0.h[4]
2301+        fmla v19.8h, v4.8h, v0.h[5]
2302+        fmla v21.8h, v4.8h, v0.h[6]
2303+        fmla v23.8h, v4.8h, v0.h[7]
2304+        ret
2305+
// -----------------------------------------------------------------------------
// Compute8x8Unit: depth accumulation for an 8-row x 8-column fp16 tile.
// Accumulators v8,v10,...,v22 (one 8h vector per row).  x10 = A, x11 = B,
// x14 = depth counter.  Two depth steps per loop iteration; tail handles the
// last one or two steps (x14 == 0 -> two remain, nonzero -> one).
// NOTE(review): expects v0/v3 preloaded by the caller (inferred from the
// sibling loops in this file — confirm upstream).
2306+Compute8x8Unit:
2307+    subs x14, x14, #2
2308+    ble Compute8x8End
2309+    Compute8x8:
2310+        prfm pldl1keep, [x10, #632]
// v1 = A for depth step 1; v4 = B for depth step 1
2311+        ld1 {v1.8h}, [x10], #16
2312+        ld1 {v4.8h}, [x11], #16
// depth step 0: B = v3, A = v0
2313+        fmla v8.8h, v3.8h, v0.h[0]
2314+        fmla v10.8h, v3.8h, v0.h[1]
2315+        fmla v12.8h, v3.8h, v0.h[2]
2316+        fmla v14.8h, v3.8h, v0.h[3]
2317+        fmla v16.8h, v3.8h, v0.h[4]
2318+        fmla v18.8h, v3.8h, v0.h[5]
2319+        fmla v20.8h, v3.8h, v0.h[6]
2320+        fmla v22.8h, v3.8h, v0.h[7]
2321+        prfm pldl1strm, [x11, #632]
// preload B (v3) for the next iteration / tail
2322+        ld1 {v3.8h}, [x11], #16
// depth step 1: B = v4, A = v1
2323+        fmla v8.8h, v4.8h, v1.h[0]
2324+        fmla v10.8h, v4.8h, v1.h[1]
2325+        fmla v12.8h, v4.8h, v1.h[2]
2326+        fmla v14.8h, v4.8h, v1.h[3]
// preload A (v0) for the next iteration / tail
2327+        ld1 {v0.8h}, [x10], #16
2328+        fmla v16.8h, v4.8h, v1.h[4]
2329+        fmla v18.8h, v4.8h, v1.h[5]
2330+        fmla v20.8h, v4.8h, v1.h[6]
2331+        fmla v22.8h, v4.8h, v1.h[7]
2332+
2333+        subs x14, x14, #2
2334+        bgt Compute8x8
// depth tail
2335+    Compute8x8End:
2336+        cbnz x14, Compute8x8End1
2337+        prfm pldl1keep, [x10, #632]
2338+        ld1 {v1.8h}, [x10]
2339+        ld1 {v4.8h}, [x11], #16
2340+        fmla v8.8h, v3.8h, v0.h[0]
2341+        fmla v10.8h, v3.8h, v0.h[1]
2342+        fmla v12.8h, v3.8h, v0.h[2]
2343+        fmla v14.8h, v3.8h, v0.h[3]
2344+        fmla v16.8h, v3.8h, v0.h[4]
2345+        fmla v18.8h, v3.8h, v0.h[5]
2346+        fmla v20.8h, v3.8h, v0.h[6]
2347+        fmla v22.8h, v3.8h, v0.h[7]
// roll the freshly-loaded A/B into v0/v3 for the final step
2348+        mov v0.16b, v1.16b
2349+        mov v3.16b, v4.16b
2350+    Compute8x8End1:
2351+        fmla v8.8h, v3.8h, v0.h[0]
2352+        fmla v10.8h, v3.8h, v0.h[1]
2353+        fmla v12.8h, v3.8h, v0.h[2]
2354+        fmla v14.8h, v3.8h, v0.h[3]
2355+        fmla v16.8h, v3.8h, v0.h[4]
2356+        fmla v18.8h, v3.8h, v0.h[5]
2357+        fmla v20.8h, v3.8h, v0.h[6]
2358+        fmla v22.8h, v3.8h, v0.h[7]
2359+        ret
2360+
2361+Compute8x4Unit:                         // same scheme as Compute8x8Unit but with 4-wide (.4h) B vectors/accumulators
2362+    subs x14, x14, #2                   // x14 = remaining depth steps; 2 consumed per loop iteration
2363+    ble Compute8x4End                   // <=2 steps total: handle in the tail
2364+    Compute8x4:
2365+        prfm pldl1keep, [x10, #632]
2366+        ld1 {v1.8h}, [x10], #16         // next 8 scalar operands
2367+        ld1 {v4.4h}, [x11], #8          // next 4-wide B vector
2368+        fmla v8.4h, v3.4h, v0.h[0]
2369+        fmla v10.4h, v3.4h, v0.h[1]
2370+        fmla v12.4h, v3.4h, v0.h[2]
2371+        fmla v14.4h, v3.4h, v0.h[3]
2372+        fmla v16.4h, v3.4h, v0.h[4]
2373+        fmla v18.4h, v3.4h, v0.h[5]
2374+        fmla v20.4h, v3.4h, v0.h[6]
2375+        fmla v22.4h, v3.4h, v0.h[7]
2376+        prfm pldl1strm, [x11, #632]
2377+        ld1 {v3.4h}, [x11], #8          // pre-load B for the next iteration
2378+        fmla v8.4h, v4.4h, v1.h[0]
2379+        fmla v10.4h, v4.4h, v1.h[1]
2380+        fmla v12.4h, v4.4h, v1.h[2]
2381+        fmla v14.4h, v4.4h, v1.h[3]
2382+        ld1 {v0.8h}, [x10], #16         // pre-load scalars for the next iteration
2383+        fmla v16.4h, v4.4h, v1.h[4]
2384+        fmla v18.4h, v4.4h, v1.h[5]
2385+        fmla v20.4h, v4.4h, v1.h[6]
2386+        fmla v22.4h, v4.4h, v1.h[7]
2387+
2388+        subs x14, x14, #2
2389+        bgt Compute8x4
2390+    Compute8x4End:                      // tail: x14 == -1 (one step left) or 0 (two steps left)
2391+        cbnz x14, Compute8x4End1
2392+        prfm pldl1keep, [x10, #632]
2393+        ld1 {v1.8h}, [x10]              // final load: no post-increment
2394+        ld1 {v4.4h}, [x11], #8
2395+        fmla v8.4h, v3.4h, v0.h[0]
2396+        fmla v10.4h, v3.4h, v0.h[1]
2397+        fmla v12.4h, v3.4h, v0.h[2]
2398+        fmla v14.4h, v3.4h, v0.h[3]
2399+        fmla v16.4h, v3.4h, v0.h[4]
2400+        fmla v18.4h, v3.4h, v0.h[5]
2401+        fmla v20.4h, v3.4h, v0.h[6]
2402+        fmla v22.4h, v3.4h, v0.h[7]
2403+        mov v0.16b, v1.16b              // promote pre-loaded data for the final step
2404+        mov v3.8b, v4.8b                // only the low 64 bits carry the 4-wide B vector
2405+    Compute8x4End1:                     // final single depth step
2406+        fmla v8.4h, v3.4h, v0.h[0]
2407+        fmla v10.4h, v3.4h, v0.h[1]
2408+        fmla v12.4h, v3.4h, v0.h[2]
2409+        fmla v14.4h, v3.4h, v0.h[3]
2410+        fmla v16.4h, v3.4h, v0.h[4]
2411+        fmla v18.4h, v3.4h, v0.h[5]
2412+        fmla v20.4h, v3.4h, v0.h[6]
2413+        fmla v22.4h, v3.4h, v0.h[7]
2414+        ret
2415+
2416+Compute4x16Unit:                        // 4 scalar lanes x 16 output cols: each depth step uses TWO 8-wide B vectors (v8/v9 pairs etc.)
2417+    subs x14, x14, #2                   // x14 = remaining depth steps; 2 per loop iteration
2418+    ble Compute4x16End
2419+    Compute4x16:
2420+        prfm pldl1keep, [x10, #632]
2421+        ld1 {v1.4h}, [x10], #8          // scalars for the 2nd depth step of this iteration
2422+        ld1 {v4.8h, v5.8h}, [x11], #32  // v4 = 2nd half of step 1, v5 = 1st half of step 2
2423+        fmla v8.8h, v3.8h, v0.h[0]      // step 1, columns 0-7
2424+        fmla v10.8h, v3.8h, v0.h[1]
2425+        fmla v12.8h, v3.8h, v0.h[2]
2426+        fmla v14.8h, v3.8h, v0.h[3]
2427+        prfm pldl1strm, [x11, #632]
2428+        ld1 {v6.8h}, [x11], #16
2429+        fmla v9.8h, v4.8h, v0.h[0]      // step 1, columns 8-15
2430+        fmla v11.8h, v4.8h, v0.h[1]
2431+        fmla v13.8h, v4.8h, v0.h[2]
2432+        fmla v15.8h, v4.8h, v0.h[3]
2433+
2434+        fmla v8.8h, v5.8h, v1.h[0]      // step 2, columns 0-7
2435+        fmla v10.8h, v5.8h, v1.h[1]
2436+        fmla v12.8h, v5.8h, v1.h[2]
2437+        fmla v14.8h, v5.8h, v1.h[3]
2438+        prfm pldl1strm, [x11, #632]
2439+        ld1 {v3.8h}, [x11], #16         // pre-load B (1st half) for the next iteration
2440+        fmla v9.8h, v6.8h, v1.h[0]      // step 2, columns 8-15
2441+        fmla v11.8h, v6.8h, v1.h[1]
2442+        fmla v13.8h, v6.8h, v1.h[2]
2443+        fmla v15.8h, v6.8h, v1.h[3]
2444+        ld1 {v0.4h}, [x10], #8          // pre-load scalars for the next iteration
2445+
2446+        subs x14, x14, #2
2447+        bgt Compute4x16
2448+    Compute4x16End:                     // tail: x14 == -1 (one step left) or 0 (two steps left)
2449+        cbnz x14, Compute4x16End1
2450+        prfm pldl1keep, [x10, #632]
2451+        ld1 {v1.4h}, [x10]              // final scalar load: no post-increment
2452+        ld1 {v4.8h}, [x11], #16
2453+        fmla v8.8h, v3.8h, v0.h[0]
2454+        fmla v10.8h, v3.8h, v0.h[1]
2455+        fmla v12.8h, v3.8h, v0.h[2]
2456+        fmla v14.8h, v3.8h, v0.h[3]
2457+        prfm pldl1strm, [x11, #632]
2458+        ld1 {v3.8h}, [x11], #16
2459+        fmla v9.8h, v4.8h, v0.h[0]
2460+        fmla v11.8h, v4.8h, v0.h[1]
2461+        fmla v13.8h, v4.8h, v0.h[2]
2462+        fmla v15.8h, v4.8h, v0.h[3]
2463+        mov v0.8b, v1.8b                // promote pre-loaded scalars for the last step
2464+    Compute4x16End1:                    // final single depth step (both 8-wide column halves)
2465+        ld1 {v4.8h}, [x11], #16
2466+        fmla v8.8h, v3.8h, v0.h[0]
2467+        fmla v10.8h, v3.8h, v0.h[1]
2468+        fmla v12.8h, v3.8h, v0.h[2]
2469+        fmla v14.8h, v3.8h, v0.h[3]
2470+        fmla v9.8h, v4.8h, v0.h[0]
2471+        fmla v11.8h, v4.8h, v0.h[1]
2472+        fmla v13.8h, v4.8h, v0.h[2]
2473+        fmla v15.8h, v4.8h, v0.h[3]
2474+        ret                             // accumulators v8..v15 hold the 4x16 tile
2475+
2476+Compute4x8Unit:                         // 4 scalar lanes x 8 cols; accumulators v8/v10/v12/v14, 2x-unrolled depth loop
2477+    subs x14, x14, #2                   // x14 = remaining depth steps
2478+    ble Compute4x8End
2479+    Compute4x8:
2480+        prfm pldl1keep, [x10, #632]
2481+        ld1 {v1.4h}, [x10], #8          // scalars for the 2nd depth step
2482+        ld1 {v4.8h}, [x11], #16         // B vector for the 2nd depth step
2483+        fmla v8.8h, v3.8h, v0.h[0]
2484+        fmla v10.8h, v3.8h, v0.h[1]
2485+        fmla v12.8h, v3.8h, v0.h[2]
2486+        fmla v14.8h, v3.8h, v0.h[3]
2487+        prfm pldl1strm, [x11, #632]
2488+        ld1 {v3.8h}, [x11], #16         // pre-load B for the next iteration
2489+        fmla v8.8h, v4.8h, v1.h[0]
2490+        fmla v10.8h, v4.8h, v1.h[1]
2491+        fmla v12.8h, v4.8h, v1.h[2]
2492+        fmla v14.8h, v4.8h, v1.h[3]
2493+        ld1 {v0.4h}, [x10], #8          // pre-load scalars for the next iteration
2494+
2495+        subs x14, x14, #2
2496+        bgt Compute4x8
2497+    Compute4x8End:                      // tail: x14 == -1 (one step left) or 0 (two steps left)
2498+        cbnz x14, Compute4x8End1
2499+        prfm pldl1keep, [x10, #632]
2500+        ld1 {v1.4h}, [x10]              // final load: no post-increment
2501+        ld1 {v4.8h}, [x11], #16
2502+        fmla v8.8h, v3.8h, v0.h[0]
2503+        fmla v10.8h, v3.8h, v0.h[1]
2504+        fmla v12.8h, v3.8h, v0.h[2]
2505+        fmla v14.8h, v3.8h, v0.h[3]
2506+        mov v0.8b, v1.8b                // promote pre-loaded pair for the final step
2507+        mov v3.16b, v4.16b
2508+    Compute4x8End1:                     // final single depth step
2509+        fmla v8.8h, v3.8h, v0.h[0]
2510+        fmla v10.8h, v3.8h, v0.h[1]
2511+        fmla v12.8h, v3.8h, v0.h[2]
2512+        fmla v14.8h, v3.8h, v0.h[3]
2513+        ret
2514+
2515+Compute4x4Unit:                         // 4 scalar lanes x 4 cols (.4h vectors); smallest symmetric tile of this family
2516+    subs x14, x14, #2                   // x14 = remaining depth steps; 2 per loop iteration
2517+    ble Compute4x4End
2518+    Compute4x4:
2519+        prfm pldl1keep, [x10, #632]
2520+        ld1 {v1.4h}, [x10], #8          // scalars for the 2nd depth step
2521+        ld1 {v4.4h}, [x11], #8          // B vector for the 2nd depth step
2522+        fmla v8.4h, v3.4h, v0.h[0]
2523+        fmla v10.4h, v3.4h, v0.h[1]
2524+        fmla v12.4h, v3.4h, v0.h[2]
2525+        fmla v14.4h, v3.4h, v0.h[3]
2526+        prfm pldl1strm, [x11, #632]
2527+        ld1 {v3.4h}, [x11], #8          // pre-load B for the next iteration
2528+        fmla v8.4h, v4.4h, v1.h[0]
2529+        fmla v10.4h, v4.4h, v1.h[1]
2530+        fmla v12.4h, v4.4h, v1.h[2]
2531+        fmla v14.4h, v4.4h, v1.h[3]
2532+        ld1 {v0.4h}, [x10], #8          // pre-load scalars for the next iteration
2533+
2534+        subs x14, x14, #2
2535+        bgt Compute4x4
2536+    Compute4x4End:                      // tail: x14 == -1 (one step left) or 0 (two steps left)
2537+        cbnz x14, Compute4x4End1
2538+        prfm pldl1keep, [x10, #632]
2539+        ld1 {v1.4h}, [x10]              // final load: no post-increment
2540+        ld1 {v4.4h}, [x11], #8
2541+        fmla v8.4h, v3.4h, v0.h[0]
2542+        fmla v10.4h, v3.4h, v0.h[1]
2543+        fmla v12.4h, v3.4h, v0.h[2]
2544+        fmla v14.4h, v3.4h, v0.h[3]
2545+        mov v0.8b, v1.8b                // promote pre-loaded pair for the final step
2546+        mov v3.8b, v4.8b
2547+    Compute4x4End1:                     // final single depth step
2548+        fmla v8.4h, v3.4h, v0.h[0]
2549+        fmla v10.4h, v3.4h, v0.h[1]
2550+        fmla v12.4h, v3.4h, v0.h[2]
2551+        fmla v14.4h, v3.4h, v0.h[3]
2552+        ret
2553+
2554+Compute3x16Unit:                        // 3 input rows x 16 cols; row accumulators: v8/v9, v10/v11, v12/v13 (two 8-wide halves each)
2555+    add x19, x10, x16                   // x19 = 2nd row pointer (x16 presumably the row stride in bytes -- confirm against caller)
2556+    add x20, x10, x16, lsl #1           // x20 = 3rd row pointer (x10 + 2*stride)
2557+    subs x14, x14, #8                   // main loop consumes depth in chunks of 8
2558+    blt Compute3x16End4                 // fewer than 8 left: fall to the 4/tail handlers
2559+    Compute3x16:
2560+        ld1 {v0.8h}, [x10], #16         // 8 depth scalars for each of the 3 rows
2561+        ld1 {v1.8h}, [x19], #16
2562+        ld1 {v2.8h}, [x20], #16
2563+        prfm pldl1strm, [x11, #632]
2564+        ld1 {v3.8h, v4.8h}, [x11], #32  // B halves for depth step 0
2565+        fmla v8.8h, v3.8h, v0.h[0]      // depth step 0, cols 0-7, rows 0..2
2566+        fmla v10.8h, v3.8h, v1.h[0]
2567+        fmla v12.8h, v3.8h, v2.h[0]
2568+        ld1 {v5.8h, v6.8h}, [x11], #32
2569+        fmla v9.8h, v4.8h, v0.h[0]      // depth step 0, cols 8-15
2570+        fmla v11.8h, v4.8h, v1.h[0]
2571+        fmla v13.8h, v4.8h, v2.h[0]
2572+        fmla v8.8h, v5.8h, v0.h[1]      // depth step 1
2573+        fmla v10.8h, v5.8h, v1.h[1]
2574+        fmla v12.8h, v5.8h, v2.h[1]
2575+        ld1 {v3.8h, v4.8h}, [x11], #32
2576+        fmla v9.8h, v6.8h, v0.h[1]
2577+        fmla v11.8h, v6.8h, v1.h[1]
2578+        fmla v13.8h, v6.8h, v2.h[1]
2579+        fmla v8.8h, v3.8h, v0.h[2]      // depth step 2
2580+        fmla v10.8h, v3.8h, v1.h[2]
2581+        fmla v12.8h, v3.8h, v2.h[2]
2582+        ld1 {v5.8h, v6.8h}, [x11], #32
2583+        fmla v9.8h, v4.8h, v0.h[2]
2584+        fmla v11.8h, v4.8h, v1.h[2]
2585+        fmla v13.8h, v4.8h, v2.h[2]
2586+        fmla v8.8h, v5.8h, v0.h[3]      // depth step 3
2587+        fmla v10.8h, v5.8h, v1.h[3]
2588+        fmla v12.8h, v5.8h, v2.h[3]
2589+        prfm pldl1strm, [x11, #632]
2590+        ld1 {v3.8h, v4.8h}, [x11], #32
2591+        fmla v9.8h, v6.8h, v0.h[3]
2592+        fmla v11.8h, v6.8h, v1.h[3]
2593+        fmla v13.8h, v6.8h, v2.h[3]
2594+
2595+        fmla v8.8h, v3.8h, v0.h[4]      // depth step 4
2596+        fmla v10.8h, v3.8h, v1.h[4]
2597+        fmla v12.8h, v3.8h, v2.h[4]
2598+        ld1 {v5.8h, v6.8h}, [x11], #32
2599+        fmla v9.8h, v4.8h, v0.h[4]
2600+        fmla v11.8h, v4.8h, v1.h[4]
2601+        fmla v13.8h, v4.8h, v2.h[4]
2602+        fmla v8.8h, v5.8h, v0.h[5]      // depth step 5
2603+        fmla v10.8h, v5.8h, v1.h[5]
2604+        fmla v12.8h, v5.8h, v2.h[5]
2605+        ld1 {v3.8h, v4.8h}, [x11], #32
2606+        fmla v9.8h, v6.8h, v0.h[5]
2607+        fmla v11.8h, v6.8h, v1.h[5]
2608+        fmla v13.8h, v6.8h, v2.h[5]
2609+        fmla v8.8h, v3.8h, v0.h[6]      // depth step 6
2610+        fmla v10.8h, v3.8h, v1.h[6]
2611+        fmla v12.8h, v3.8h, v2.h[6]
2612+        ld1 {v5.8h, v6.8h}, [x11], #32
2613+        fmla v9.8h, v4.8h, v0.h[6]
2614+        fmla v11.8h, v4.8h, v1.h[6]
2615+        fmla v13.8h, v4.8h, v2.h[6]
2616+        fmla v8.8h, v5.8h, v0.h[7]      // depth step 7
2617+        fmla v10.8h, v5.8h, v1.h[7]
2618+        fmla v12.8h, v5.8h, v2.h[7]
2619+        fmla v9.8h, v6.8h, v0.h[7]
2620+        fmla v11.8h, v6.8h, v1.h[7]
2621+        fmla v13.8h, v6.8h, v2.h[7]
2622+
2623+        subs x14, x14, #8
2624+        bge Compute3x16
2625+    Compute3x16End4:                    // <8 steps remain; x14 currently holds remainder-8
2626+        adds x14, x14, #8               // restore the true remainder
2627+        cbz x14, Compute3x16Return      // depth was an exact multiple of 8: done
2628+        subs x14, x14, #4
2629+        blt Compute3x16EndTail          // <4 remain: go to 1/2/3 handling
2630+        ld1 {v0.4h}, [x10], #8          // consume one chunk of 4 depth steps
2631+        ld1 {v1.4h}, [x19], #8
2632+        ld1 {v2.4h}, [x20], #8
2633+        prfm pldl1strm, [x11, #632]
2634+        ld1 {v3.8h, v4.8h}, [x11], #32
2635+        fmla v8.8h, v3.8h, v0.h[0]
2636+        fmla v10.8h, v3.8h, v1.h[0]
2637+        fmla v12.8h, v3.8h, v2.h[0]
2638+        ld1 {v5.8h, v6.8h}, [x11], #32
2639+        fmla v9.8h, v4.8h, v0.h[0]
2640+        fmla v11.8h, v4.8h, v1.h[0]
2641+        fmla v13.8h, v4.8h, v2.h[0]
2642+        fmla v8.8h, v5.8h, v0.h[1]
2643+        fmla v10.8h, v5.8h, v1.h[1]
2644+        fmla v12.8h, v5.8h, v2.h[1]
2645+        ld1 {v3.8h, v4.8h}, [x11], #32
2646+        fmla v9.8h, v6.8h, v0.h[1]
2647+        fmla v11.8h, v6.8h, v1.h[1]
2648+        fmla v13.8h, v6.8h, v2.h[1]
2649+        fmla v8.8h, v3.8h, v0.h[2]
2650+        fmla v10.8h, v3.8h, v1.h[2]
2651+        fmla v12.8h, v3.8h, v2.h[2]
2652+        ld1 {v5.8h, v6.8h}, [x11], #32
2653+        fmla v9.8h, v4.8h, v0.h[2]
2654+        fmla v11.8h, v4.8h, v1.h[2]
2655+        fmla v13.8h, v4.8h, v2.h[2]
2656+        fmla v8.8h, v5.8h, v0.h[3]
2657+        fmla v10.8h, v5.8h, v1.h[3]
2658+        fmla v12.8h, v5.8h, v2.h[3]
2659+        fmla v9.8h, v6.8h, v0.h[3]
2660+        fmla v11.8h, v6.8h, v1.h[3]
2661+        fmla v13.8h, v6.8h, v2.h[3]
2662+        subs x14, x14, #4
2663+    Compute3x16EndTail:                 // 1-3 depth steps remain (or 0 via the cbz below)
2664+        adds x14, x14, #4
2665+        cbz x14, Compute3x16Return
2666+        cmp x14, #1
2667+        beq Compute3x16EndTail1
2668+        cmp x14, #2
2669+        beq Compute3x16EndTail2
2670+        ld1 {v0.4h}, [x10]              // 3 steps remain; rows 0/1 may over-read 1 element (next row data follows)
2671+        ld1 {v1.4h}, [x19]
2672+        ld1 {v2.s}[0], [x20], #4        // last row: load exactly 3 halves (2+1) to avoid reading past the buffer
2673+        ld1 {v2.h}[2], [x20]
2674+        prfm pldl1strm, [x11, #632]
2675+        ld1 {v3.8h, v4.8h}, [x11], #32
2676+        fmla v8.8h, v3.8h, v0.h[0]
2677+        fmla v10.8h, v3.8h, v1.h[0]
2678+        fmla v12.8h, v3.8h, v2.h[0]
2679+        ld1 {v5.8h, v6.8h}, [x11], #32
2680+        fmla v9.8h, v4.8h, v0.h[0]
2681+        fmla v11.8h, v4.8h, v1.h[0]
2682+        fmla v13.8h, v4.8h, v2.h[0]
2683+        fmla v8.8h, v5.8h, v0.h[1]
2684+        fmla v10.8h, v5.8h, v1.h[1]
2685+        fmla v12.8h, v5.8h, v2.h[1]
2686+        ld1 {v3.8h, v4.8h}, [x11], #32
2687+        fmla v9.8h, v6.8h, v0.h[1]
2688+        fmla v11.8h, v6.8h, v1.h[1]
2689+        fmla v13.8h, v6.8h, v2.h[1]
2690+        fmla v8.8h, v3.8h, v0.h[2]
2691+        fmla v10.8h, v3.8h, v1.h[2]
2692+        fmla v12.8h, v3.8h, v2.h[2]
2693+        fmla v9.8h, v4.8h, v0.h[2]
2694+        fmla v11.8h, v4.8h, v1.h[2]
2695+        fmla v13.8h, v4.8h, v2.h[2]
2696+        b Compute3x16Return
2697+    Compute3x16EndTail2:                // exactly 2 depth steps remain
2698+        ld1 {v0.4h}, [x10]
2699+        ld1 {v1.4h}, [x19]
2700+        ld1 {v2.s}[0], [x20]            // last row: exactly 2 halves
2701+        prfm pldl1strm, [x11, #632]
2702+        ld1 {v3.8h, v4.8h}, [x11], #32
2703+        fmla v8.8h, v3.8h, v0.h[0]
2704+        fmla v10.8h, v3.8h, v1.h[0]
2705+        fmla v12.8h, v3.8h, v2.h[0]
2706+        ld1 {v5.8h, v6.8h}, [x11], #32
2707+        fmla v9.8h, v4.8h, v0.h[0]
2708+        fmla v11.8h, v4.8h, v1.h[0]
2709+        fmla v13.8h, v4.8h, v2.h[0]
2710+        fmla v8.8h, v5.8h, v0.h[1]
2711+        fmla v10.8h, v5.8h, v1.h[1]
2712+        fmla v12.8h, v5.8h, v2.h[1]
2713+        fmla v9.8h, v6.8h, v0.h[1]
2714+        fmla v11.8h, v6.8h, v1.h[1]
2715+        fmla v13.8h, v6.8h, v2.h[1]
2716+        b Compute3x16Return
2717+    Compute3x16EndTail1:                // exactly 1 depth step remains
2718+        ld1 {v0.h}[0], [x10]            // single-element loads for all three rows
2719+        ld1 {v1.h}[0], [x19]
2720+        ld1 {v2.h}[0], [x20]
2721+        prfm pldl1strm, [x11, #632]
2722+        ld1 {v3.8h, v4.8h}, [x11], #32
2723+        fmla v8.8h, v3.8h, v0.h[0]
2724+        fmla v10.8h, v3.8h, v1.h[0]
2725+        fmla v12.8h, v3.8h, v2.h[0]
2726+        fmla v9.8h, v4.8h, v0.h[0]
2727+        fmla v11.8h, v4.8h, v1.h[0]
2728+        fmla v13.8h, v4.8h, v2.h[0]
2729+    Compute3x16Return:
2730+        ret                             // accumulators v8..v13 hold the 3x16 tile
2731+
2732+Compute3x8Unit:                         // 3 input rows x 8 cols; one 8-wide accumulator per row: v8, v10, v12
2733+    add x19, x10, x16                   // x19 = 2nd row pointer (x16 presumably the row stride in bytes)
2734+    add x20, x10, x16, lsl #1           // x20 = 3rd row pointer
2735+    subs x14, x14, #8                   // main loop consumes depth in chunks of 8
2736+    blt Compute3x8End4
2737+    Compute3x8:
2738+        ld1 {v0.8h}, [x10], #16         // 8 depth scalars per row
2739+        ld1 {v1.8h}, [x19], #16
2740+        ld1 {v2.8h}, [x20], #16
2741+        prfm pldl1strm, [x11, #632]
2742+        ld1 {v3.8h, v4.8h}, [x11], #32  // B vectors for depth steps 0 and 1
2743+        fmla v8.8h, v3.8h, v0.h[0]      // depth step 0
2744+        fmla v10.8h, v3.8h, v1.h[0]
2745+        fmla v12.8h, v3.8h, v2.h[0]
2746+        ld1 {v5.8h, v6.8h}, [x11], #32
2747+        fmla v8.8h, v4.8h, v0.h[1]      // depth step 1
2748+        fmla v10.8h, v4.8h, v1.h[1]
2749+        fmla v12.8h, v4.8h, v2.h[1]
2750+        fmla v8.8h, v5.8h, v0.h[2]      // depth step 2
2751+        fmla v10.8h, v5.8h, v1.h[2]
2752+        fmla v12.8h, v5.8h, v2.h[2]
2753+        prfm pldl1strm, [x11, #632]
2754+        ld1 {v3.8h, v4.8h}, [x11], #32
2755+        fmla v8.8h, v6.8h, v0.h[3]      // depth step 3
2756+        fmla v10.8h, v6.8h, v1.h[3]
2757+        fmla v12.8h, v6.8h, v2.h[3]
2758+        fmla v8.8h, v3.8h, v0.h[4]      // depth step 4
2759+        fmla v10.8h, v3.8h, v1.h[4]
2760+        fmla v12.8h, v3.8h, v2.h[4]
2761+        ld1 {v5.8h, v6.8h}, [x11], #32
2762+        fmla v8.8h, v4.8h, v0.h[5]      // depth step 5
2763+        fmla v10.8h, v4.8h, v1.h[5]
2764+        fmla v12.8h, v4.8h, v2.h[5]
2765+        fmla v8.8h, v5.8h, v0.h[6]      // depth step 6
2766+        fmla v10.8h, v5.8h, v1.h[6]
2767+        fmla v12.8h, v5.8h, v2.h[6]
2768+        fmla v8.8h, v6.8h, v0.h[7]      // depth step 7
2769+        fmla v10.8h, v6.8h, v1.h[7]
2770+        fmla v12.8h, v6.8h, v2.h[7]
2771+
2772+        subs x14, x14, #8
2773+        bge Compute3x8
2774+    Compute3x8End4:                     // <8 steps remain
2775+        adds x14, x14, #8               // restore true remainder
2776+        cbz x14, Compute3x8Return
2777+        subs x14, x14, #4
2778+        blt Compute3x8EndTail           // <4 remain
2779+        ld1 {v0.4h}, [x10], #8          // one chunk of 4 depth steps
2780+        ld1 {v1.4h}, [x19], #8
2781+        ld1 {v2.4h}, [x20], #8
2782+        prfm pldl1strm, [x11, #632]
2783+        ld1 {v3.8h, v4.8h}, [x11], #32
2784+        fmla v8.8h, v3.8h, v0.h[0]
2785+        fmla v10.8h, v3.8h, v1.h[0]
2786+        fmla v12.8h, v3.8h, v2.h[0]
2787+        ld1 {v5.8h, v6.8h}, [x11], #32
2788+        fmla v8.8h, v4.8h, v0.h[1]
2789+        fmla v10.8h, v4.8h, v1.h[1]
2790+        fmla v12.8h, v4.8h, v2.h[1]
2791+        fmla v8.8h, v5.8h, v0.h[2]
2792+        fmla v10.8h, v5.8h, v1.h[2]
2793+        fmla v12.8h, v5.8h, v2.h[2]
2794+        fmla v8.8h, v6.8h, v0.h[3]
2795+        fmla v10.8h, v6.8h, v1.h[3]
2796+        fmla v12.8h, v6.8h, v2.h[3]
2797+        subs x14, x14, #4
2798+    Compute3x8EndTail:                  // 1-3 depth steps remain (or 0)
2799+        adds x14, x14, #4
2800+        cbz x14, Compute3x8Return
2801+        cmp x14, #1
2802+        beq Compute3x8EndTail1
2803+        cmp x14, #2
2804+        beq Compute3x8EndTail2
2805+        ld1 {v0.4h}, [x10]              // 3 steps remain; rows 0/1 may over-read 1 element
2806+        ld1 {v1.4h}, [x19]
2807+        ld1 {v2.s}[0], [x20], #4        // last row: exactly 3 halves, avoid out-of-bounds read
2808+        ld1 {v2.h}[2], [x20]
2809+        prfm pldl1strm, [x11, #632]
2810+        ld1 {v3.8h, v4.8h}, [x11], #32
2811+        fmla v8.8h, v3.8h, v0.h[0]
2812+        fmla v10.8h, v3.8h, v1.h[0]
2813+        fmla v12.8h, v3.8h, v2.h[0]
2814+        ld1 {v5.8h}, [x11], #16
2815+        fmla v8.8h, v4.8h, v0.h[1]
2816+        fmla v10.8h, v4.8h, v1.h[1]
2817+        fmla v12.8h, v4.8h, v2.h[1]
2818+        fmla v8.8h, v5.8h, v0.h[2]
2819+        fmla v10.8h, v5.8h, v1.h[2]
2820+        fmla v12.8h, v5.8h, v2.h[2]
2821+        b Compute3x8Return
2822+    Compute3x8EndTail2:                 // exactly 2 depth steps remain
2823+        ld1 {v0.4h}, [x10]
2824+        ld1 {v1.4h}, [x19]
2825+        ld2 {v2.h, v3.h}[0], [x20]      // last row's 2 halves de-interleaved into v2.h[0]/v3.h[0]
2826+        prfm pldl1strm, [x11, #632]
2827+        ld1 {v5.8h, v6.8h}, [x11], #32
2828+        fmla v8.8h, v5.8h, v0.h[0]
2829+        fmla v10.8h, v5.8h, v1.h[0]
2830+        fmla v12.8h, v5.8h, v2.h[0]
2831+        fmla v8.8h, v6.8h, v0.h[1]
2832+        fmla v10.8h, v6.8h, v1.h[1]
2833+        fmla v12.8h, v6.8h, v3.h[0]     // 2nd scalar of last row lives in v3.h[0] (from the ld2 above)
2834+        b Compute3x8Return
2835+    Compute3x8EndTail1:                 // exactly 1 depth step remains
2836+        ld1 {v0.h}[0], [x10]
2837+        ld1 {v1.h}[0], [x19]
2838+        ld1 {v2.h}[0], [x20]
2839+        prfm pldl1strm, [x11, #632]
2840+        ld1 {v3.8h}, [x11], #16
2841+        fmla v8.8h, v3.8h, v0.h[0]
2842+        fmla v10.8h, v3.8h, v1.h[0]
2843+        fmla v12.8h, v3.8h, v2.h[0]
2844+    Compute3x8Return:
2845+        ret                             // accumulators v8/v10/v12 hold the 3x8 tile
2846+
2847+Compute3x4Unit:                         // 3 input rows x 4 cols (.4h); accumulators v8, v10, v12
2848+    add x19, x10, x16                   // x19 = 2nd row pointer (x16 presumably the row stride in bytes)
2849+    add x20, x10, x16, lsl #1           // x20 = 3rd row pointer
2850+    subs x14, x14, #8                   // main loop consumes depth in chunks of 8
2851+    blt Compute3x4End4
2852+    Compute3x4:
2853+        ld1 {v0.8h}, [x10], #16         // 8 depth scalars per row
2854+        ld1 {v1.8h}, [x19], #16
2855+        ld1 {v2.8h}, [x20], #16
2856+        prfm pldl1strm, [x11, #632]
2857+        ld1 {v3.4h, v4.4h}, [x11], #16  // B vectors for depth steps 0 and 1
2858+        fmla v8.4h, v3.4h, v0.h[0]      // depth step 0
2859+        fmla v10.4h, v3.4h, v1.h[0]
2860+        fmla v12.4h, v3.4h, v2.h[0]
2861+        ld1 {v5.4h, v6.4h}, [x11], #16
2862+        fmla v8.4h, v4.4h, v0.h[1]      // depth step 1
2863+        fmla v10.4h, v4.4h, v1.h[1]
2864+        fmla v12.4h, v4.4h, v2.h[1]
2865+        fmla v8.4h, v5.4h, v0.h[2]      // depth step 2
2866+        fmla v10.4h, v5.4h, v1.h[2]
2867+        fmla v12.4h, v5.4h, v2.h[2]
2868+        prfm pldl1strm, [x11, #632]
2869+        ld1 {v3.4h, v4.4h}, [x11], #16
2870+        fmla v8.4h, v6.4h, v0.h[3]      // depth step 3
2871+        fmla v10.4h, v6.4h, v1.h[3]
2872+        fmla v12.4h, v6.4h, v2.h[3]
2873+        fmla v8.4h, v3.4h, v0.h[4]      // depth step 4
2874+        fmla v10.4h, v3.4h, v1.h[4]
2875+        fmla v12.4h, v3.4h, v2.h[4]
2876+        ld1 {v5.4h, v6.4h}, [x11], #16
2877+        fmla v8.4h, v4.4h, v0.h[5]      // depth step 5
2878+        fmla v10.4h, v4.4h, v1.h[5]
2879+        fmla v12.4h, v4.4h, v2.h[5]
2880+        fmla v8.4h, v5.4h, v0.h[6]      // depth step 6
2881+        fmla v10.4h, v5.4h, v1.h[6]
2882+        fmla v12.4h, v5.4h, v2.h[6]
2883+        fmla v8.4h, v6.4h, v0.h[7]      // depth step 7
2884+        fmla v10.4h, v6.4h, v1.h[7]
2885+        fmla v12.4h, v6.4h, v2.h[7]
2886+
2887+        subs x14, x14, #8
2888+        bge Compute3x4
2889+    Compute3x4End4:                     // <8 steps remain
2890+        adds x14, x14, #8               // restore true remainder
2891+        cbz x14, Compute3x4Return
2892+        subs x14, x14, #4
2893+        blt Compute3x4EndTail           // <4 remain
2894+        ld1 {v0.4h}, [x10], #8          // one chunk of 4 depth steps
2895+        ld1 {v1.4h}, [x19], #8
2896+        ld1 {v2.4h}, [x20], #8
2897+        prfm pldl1strm, [x11, #632]
2898+        ld1 {v3.4h, v4.4h}, [x11], #16
2899+        fmla v8.4h, v3.4h, v0.h[0]
2900+        fmla v10.4h, v3.4h, v1.h[0]
2901+        fmla v12.4h, v3.4h, v2.h[0]
2902+        ld1 {v5.4h, v6.4h}, [x11], #16
2903+        fmla v8.4h, v4.4h, v0.h[1]
2904+        fmla v10.4h, v4.4h, v1.h[1]
2905+        fmla v12.4h, v4.4h, v2.h[1]
2906+        fmla v8.4h, v5.4h, v0.h[2]
2907+        fmla v10.4h, v5.4h, v1.h[2]
2908+        fmla v12.4h, v5.4h, v2.h[2]
2909+        fmla v8.4h, v6.4h, v0.h[3]
2910+        fmla v10.4h, v6.4h, v1.h[3]
2911+        fmla v12.4h, v6.4h, v2.h[3]
2912+        subs x14, x14, #4
2913+    Compute3x4EndTail:                  // 1-3 depth steps remain (or 0)
2914+        adds x14, x14, #4
2915+        cbz x14, Compute3x4Return
2916+        cmp x14, #1
2917+        beq Compute3x4EndTail1
2918+        cmp x14, #2
2919+        beq Compute3x4EndTail2
2920+        ld1 {v0.4h}, [x10]              // 3 steps remain; rows 0/1 may over-read 1 element
2921+        ld1 {v1.4h}, [x19]
2922+        ld1 {v2.s}[0], [x20], #4        // last row: exactly 3 halves, avoid out-of-bounds read
2923+        ld1 {v2.h}[2], [x20]
2924+        prfm pldl1strm, [x11, #632]
2925+        ld1 {v3.4h, v4.4h}, [x11], #16
2926+        fmla v8.4h, v3.4h, v0.h[0]
2927+        fmla v10.4h, v3.4h, v1.h[0]
2928+        fmla v12.4h, v3.4h, v2.h[0]
2929+        ld1 {v5.4h}, [x11], #8
2930+        fmla v8.4h, v4.4h, v0.h[1]
2931+        fmla v10.4h, v4.4h, v1.h[1]
2932+        fmla v12.4h, v4.4h, v2.h[1]
2933+        fmla v8.4h, v5.4h, v0.h[2]
2934+        fmla v10.4h, v5.4h, v1.h[2]
2935+        fmla v12.4h, v5.4h, v2.h[2]
2936+        b Compute3x4Return
2937+    Compute3x4EndTail2:                 // exactly 2 depth steps remain
2938+        ld1 {v0.4h}, [x10]
2939+        ld1 {v1.4h}, [x19]
2940+        ld2 {v2.h, v3.h}[0], [x20]      // last row's 2 halves de-interleaved into v2.h[0]/v3.h[0]
2941+        prfm pldl1strm, [x11, #632]
2942+        ld1 {v5.4h, v6.4h}, [x11], #16
2943+        fmla v8.4h, v5.4h, v0.h[0]
2944+        fmla v10.4h, v5.4h, v1.h[0]
2945+        fmla v12.4h, v5.4h, v2.h[0]
2946+        fmla v8.4h, v6.4h, v0.h[1]
2947+        fmla v10.4h, v6.4h, v1.h[1]
2948+        fmla v12.4h, v6.4h, v3.h[0]     // 2nd scalar of last row lives in v3.h[0]
2949+        b Compute3x4Return
2950+    Compute3x4EndTail1:                 // exactly 1 depth step remains
2951+        ld1 {v0.h}[0], [x10]
2952+        ld1 {v1.h}[0], [x19]
2953+        ld1 {v2.h}[0], [x20]
2954+        prfm pldl1strm, [x11, #632]
2955+        ld1 {v3.4h}, [x11], #8
2956+        fmla v8.4h, v3.4h, v0.h[0]
2957+        fmla v10.4h, v3.4h, v1.h[0]
2958+        fmla v12.4h, v3.4h, v2.h[0]
2959+    Compute3x4Return:
2960+        ret                             // accumulators v8/v10/v12 hold the 3x4 tile
2961+
2962+Compute2x16Unit:                        // 2 input rows x 16 cols; row accumulators v8/v9 and v10/v11 (two 8-wide halves each)
2963+    add x19, x10, x16                   // x19 = 2nd row pointer (x16 presumably the row stride in bytes)
2964+    subs x14, x14, #8                   // main loop consumes depth in chunks of 8
2965+    blt Compute2x16End4
2966+    Compute2x16:
2967+        ld1 {v0.8h}, [x10], #16         // 8 depth scalars per row
2968+        ld1 {v1.8h}, [x19], #16
2969+        prfm pldl1strm, [x11, #632]
2970+        ld1 {v3.8h, v4.8h}, [x11], #32  // B halves for depth step 0
2971+        fmla v8.8h, v3.8h, v0.h[0]      // depth step 0, cols 0-7
2972+        fmla v10.8h, v3.8h, v1.h[0]
2973+        ld1 {v5.8h, v6.8h}, [x11], #32
2974+        fmla v9.8h, v4.8h, v0.h[0]      // depth step 0, cols 8-15
2975+        fmla v11.8h, v4.8h, v1.h[0]
2976+        fmla v8.8h, v5.8h, v0.h[1]      // depth step 1
2977+        fmla v10.8h, v5.8h, v1.h[1]
2978+        ld1 {v3.8h, v4.8h}, [x11], #32
2979+        fmla v9.8h, v6.8h, v0.h[1]
2980+        fmla v11.8h, v6.8h, v1.h[1]
2981+        fmla v8.8h, v3.8h, v0.h[2]      // depth step 2
2982+        fmla v10.8h, v3.8h, v1.h[2]
2983+        ld1 {v5.8h, v6.8h}, [x11], #32
2984+        fmla v9.8h, v4.8h, v0.h[2]
2985+        fmla v11.8h, v4.8h, v1.h[2]
2986+        fmla v8.8h, v5.8h, v0.h[3]      // depth step 3
2987+        fmla v10.8h, v5.8h, v1.h[3]
2988+        prfm pldl1strm, [x11, #632]
2989+        ld1 {v3.8h, v4.8h}, [x11], #32
2990+        fmla v9.8h, v6.8h, v0.h[3]
2991+        fmla v11.8h, v6.8h, v1.h[3]
2992+
2993+        fmla v8.8h, v3.8h, v0.h[4]      // depth step 4
2994+        fmla v10.8h, v3.8h, v1.h[4]
2995+        ld1 {v5.8h, v6.8h}, [x11], #32
2996+        fmla v9.8h, v4.8h, v0.h[4]
2997+        fmla v11.8h, v4.8h, v1.h[4]
2998+        fmla v8.8h, v5.8h, v0.h[5]      // depth step 5
2999+        fmla v10.8h, v5.8h, v1.h[5]
3000+        ld1 {v3.8h, v4.8h}, [x11], #32
3001+        fmla v9.8h, v6.8h, v0.h[5]
3002+        fmla v11.8h, v6.8h, v1.h[5]
3003+        fmla v8.8h, v3.8h, v0.h[6]      // depth step 6
3004+        fmla v10.8h, v3.8h, v1.h[6]
3005+        ld1 {v5.8h, v6.8h}, [x11], #32
3006+        fmla v9.8h, v4.8h, v0.h[6]
3007+        fmla v11.8h, v4.8h, v1.h[6]
3008+        fmla v8.8h, v5.8h, v0.h[7]      // depth step 7
3009+        fmla v10.8h, v5.8h, v1.h[7]
3010+        fmla v9.8h, v6.8h, v0.h[7]
3011+        fmla v11.8h, v6.8h, v1.h[7]
3012+
3013+        subs x14, x14, #8
3014+        bge Compute2x16
3015+    Compute2x16End4:                    // <8 steps remain
3016+        adds x14, x14, #8               // restore true remainder
3017+        cbz x14, Compute2x16Return
3018+        subs x14, x14, #4
3019+        blt Compute2x16EndTail          // <4 remain
3020+        ld1 {v0.4h}, [x10], #8          // one chunk of 4 depth steps
3021+        ld1 {v1.4h}, [x19], #8
3022+        prfm pldl1strm, [x11, #632]
3023+        ld1 {v3.8h, v4.8h}, [x11], #32
3024+        fmla v8.8h, v3.8h, v0.h[0]
3025+        fmla v10.8h, v3.8h, v1.h[0]
3026+        ld1 {v5.8h, v6.8h}, [x11], #32
3027+        fmla v9.8h, v4.8h, v0.h[0]
3028+        fmla v11.8h, v4.8h, v1.h[0]
3029+        fmla v8.8h, v5.8h, v0.h[1]
3030+        fmla v10.8h, v5.8h, v1.h[1]
3031+        ld1 {v3.8h, v4.8h}, [x11], #32
3032+        fmla v9.8h, v6.8h, v0.h[1]
3033+        fmla v11.8h, v6.8h, v1.h[1]
3034+        fmla v8.8h, v3.8h, v0.h[2]
3035+        fmla v10.8h, v3.8h, v1.h[2]
3036+        ld1 {v5.8h, v6.8h}, [x11], #32
3037+        fmla v9.8h, v4.8h, v0.h[2]
3038+        fmla v11.8h, v4.8h, v1.h[2]
3039+        fmla v8.8h, v5.8h, v0.h[3]
3040+        fmla v10.8h, v5.8h, v1.h[3]
3041+        fmla v9.8h, v6.8h, v0.h[3]
3042+        fmla v11.8h, v6.8h, v1.h[3]
3043+        subs x14, x14, #4
3044+    Compute2x16EndTail:                 // 1-3 depth steps remain (or 0)
3045+        adds x14, x14, #4
3046+        cbz x14, Compute2x16Return
3047+        cmp x14, #1
3048+        beq Compute2x16EndTail1
3049+        cmp x14, #2
3050+        beq Compute2x16EndTail2
3051+        ld1 {v0.4h}, [x10]              // 3 steps remain; row 0 may over-read 1 element (row 1 data follows)
3052+        ld1 {v1.s}[0], [x19], #4        // last row: exactly 3 halves (2+1), avoid out-of-bounds read
3053+        ld1 {v1.h}[2], [x19]
3054+        prfm pldl1strm, [x11, #632]
3055+        ld1 {v3.8h, v4.8h}, [x11], #32
3056+        fmla v8.8h, v3.8h, v0.h[0]
3057+        fmla v10.8h, v3.8h, v1.h[0]
3058+        ld1 {v5.8h, v6.8h}, [x11], #32
3059+        fmla v9.8h, v4.8h, v0.h[0]
3060+        fmla v11.8h, v4.8h, v1.h[0]
3061+        fmla v8.8h, v5.8h, v0.h[1]
3062+        fmla v10.8h, v5.8h, v1.h[1]
3063+        ld1 {v3.8h, v4.8h}, [x11], #32
3064+        fmla v9.8h, v6.8h, v0.h[1]
3065+        fmla v11.8h, v6.8h, v1.h[1]
3066+        fmla v8.8h, v3.8h, v0.h[2]
3067+        fmla v10.8h, v3.8h, v1.h[2]
3068+        fmla v9.8h, v4.8h, v0.h[2]
3069+        fmla v11.8h, v4.8h, v1.h[2]
3070+        b Compute2x16Return
3071+    Compute2x16EndTail2:                // exactly 2 depth steps remain
3072+        ld1 {v0.4h}, [x10]
3073+        ld2 {v1.h, v2.h}[0], [x19]      // last row's 2 halves de-interleaved into v1.h[0]/v2.h[0]
3074+        prfm pldl1strm, [x11, #632]
3075+        ld1 {v3.8h, v4.8h}, [x11], #32
3076+        fmla v8.8h, v3.8h, v0.h[0]
3077+        fmla v10.8h, v3.8h, v1.h[0]
3078+        ld1 {v5.8h, v6.8h}, [x11], #32
3079+        fmla v9.8h, v4.8h, v0.h[0]
3080+        fmla v11.8h, v4.8h, v1.h[0]
3081+        fmla v8.8h, v5.8h, v0.h[1]
3082+        fmla v10.8h, v5.8h, v2.h[0]     // 2nd scalar of last row lives in v2.h[0]
3083+        fmla v9.8h, v6.8h, v0.h[1]
3084+        fmla v11.8h, v6.8h, v2.h[0]
3085+        b Compute2x16Return
3086+    Compute2x16EndTail1:                // exactly 1 depth step remains
3087+        ld1 {v0.h}[0], [x10]
3088+        ld1 {v1.h}[0], [x19]
3089+        prfm pldl1strm, [x11, #632]
3090+        ld1 {v3.8h, v4.8h}, [x11], #32
3091+        fmla v8.8h, v3.8h, v0.h[0]
3092+        fmla v10.8h, v3.8h, v1.h[0]
3093+        fmla v9.8h, v4.8h, v0.h[0]
3094+        fmla v11.8h, v4.8h, v1.h[0]
3095+    Compute2x16Return:
3096+        ret                             // accumulators v8..v11 hold the 2x16 tile
3097+
3098+Compute2x8Unit:                         // 2 input rows x 8 cols; accumulators v8 (row 0) and v10 (row 1)
3099+    add x19, x10, x16                   // x19 = 2nd row pointer (x16 presumably the row stride in bytes)
3100+    subs x14, x14, #8                   // main loop consumes depth in chunks of 8
3101+    blt Compute2x8End4
3102+    Compute2x8:
3103+        ld1 {v0.8h}, [x10], #16         // 8 depth scalars per row
3104+        ld1 {v1.8h}, [x19], #16
3105+        prfm pldl1strm, [x11, #632]
3106+        ld1 {v3.8h, v4.8h}, [x11], #32  // B vectors for depth steps 0 and 1
3107+        fmla v8.8h, v3.8h, v0.h[0]      // depth step 0
3108+        fmla v10.8h, v3.8h, v1.h[0]
3109+        ld1 {v5.8h, v6.8h}, [x11], #32
3110+        fmla v8.8h, v4.8h, v0.h[1]      // depth step 1
3111+        fmla v10.8h, v4.8h, v1.h[1]
3112+        fmla v8.8h, v5.8h, v0.h[2]      // depth step 2
3113+        fmla v10.8h, v5.8h, v1.h[2]
3114+        prfm pldl1strm, [x11, #632]
3115+        ld1 {v3.8h, v4.8h}, [x11], #32
3116+        fmla v8.8h, v6.8h, v0.h[3]      // depth step 3
3117+        fmla v10.8h, v6.8h, v1.h[3]
3118+        fmla v8.8h, v3.8h, v0.h[4]      // depth step 4
3119+        fmla v10.8h, v3.8h, v1.h[4]
3120+        ld1 {v5.8h, v6.8h}, [x11], #32
3121+        fmla v8.8h, v4.8h, v0.h[5]      // depth step 5
3122+        fmla v10.8h, v4.8h, v1.h[5]
3123+        fmla v8.8h, v5.8h, v0.h[6]      // depth step 6
3124+        fmla v10.8h, v5.8h, v1.h[6]
3125+        fmla v8.8h, v6.8h, v0.h[7]      // depth step 7
3126+        fmla v10.8h, v6.8h, v1.h[7]
3127+
3128+        subs x14, x14, #8
3129+        bge Compute2x8
3130+    Compute2x8End4:                     // <8 steps remain
3131+        adds x14, x14, #8               // restore true remainder
3132+        cbz x14, Compute2x8Return
3133+        subs x14, x14, #4
3134+        blt Compute2x8EndTail           // <4 remain
3135+        ld1 {v0.4h}, [x10], #8          // one chunk of 4 depth steps
3136+        ld1 {v1.4h}, [x19], #8
3137+        prfm pldl1strm, [x11, #632]
3138+        ld1 {v3.8h, v4.8h}, [x11], #32
3139+        fmla v8.8h, v3.8h, v0.h[0]
3140+        fmla v10.8h, v3.8h, v1.h[0]
3141+        ld1 {v5.8h, v6.8h}, [x11], #32
3142+        fmla v8.8h, v4.8h, v0.h[1]
3143+        fmla v10.8h, v4.8h, v1.h[1]
3144+        fmla v8.8h, v5.8h, v0.h[2]
3145+        fmla v10.8h, v5.8h, v1.h[2]
3146+        fmla v8.8h, v6.8h, v0.h[3]
3147+        fmla v10.8h, v6.8h, v1.h[3]
3148+        subs x14, x14, #4
3149+    Compute2x8EndTail:                  // 1-3 depth steps remain (or 0)
3150+        adds x14, x14, #4
3151+        cbz x14, Compute2x8Return
3152+        cmp x14, #1
3153+        beq Compute2x8EndTail1
3154+        cmp x14, #2
3155+        beq Compute2x8EndTail2
3156+        ld1 {v0.4h}, [x10]              // 3 steps remain; row 0 may over-read 1 element
3157+        ld3 {v1.h, v2.h, v3.h}[0], [x19]  // last row: exactly 3 halves de-interleaved into v1/v2/v3 lane 0
3158+        prfm pldl1strm, [x11, #632]
3159+        ld1 {v4.8h, v5.8h}, [x11], #32
3160+        fmla v8.8h, v4.8h, v0.h[0]
3161+        fmla v10.8h, v4.8h, v1.h[0]
3162+        ld1 {v6.8h}, [x11], #16
3163+        fmla v8.8h, v5.8h, v0.h[1]
3164+        fmla v10.8h, v5.8h, v2.h[0]     // 2nd scalar of last row in v2.h[0]
3165+        fmla v8.8h, v6.8h, v0.h[2]
3166+        fmla v10.8h, v6.8h, v3.h[0]     // 3rd scalar of last row in v3.h[0]
3167+        b Compute2x8Return
3168+    Compute2x8EndTail2:                 // exactly 2 depth steps remain
3169+        ld1 {v0.4h}, [x10]
3170+        ld2 {v1.h, v2.h}[0], [x19]      // last row's 2 halves de-interleaved into v1.h[0]/v2.h[0]
3171+        prfm pldl1strm, [x11, #632]
3172+        ld1 {v3.8h, v4.8h}, [x11], #32
3173+        fmla v8.8h, v3.8h, v0.h[0]
3174+        fmla v10.8h, v3.8h, v1.h[0]
3175+        fmla v8.8h, v4.8h, v0.h[1]
3176+        fmla v10.8h, v4.8h, v2.h[0]
3177+        b Compute2x8Return
3178+    Compute2x8EndTail1:                 // exactly 1 depth step remains
3179+        ld1 {v0.h}[0], [x10]
3180+        ld1 {v1.h}[0], [x19]
3181+        prfm pldl1strm, [x11, #632]
3182+        ld1 {v3.8h}, [x11], #16
3183+        fmla v8.8h, v3.8h, v0.h[0]
3184+        fmla v10.8h, v3.8h, v1.h[0]
3185+    Compute2x8Return:
3186+        ret                             // accumulators v8/v10 hold the 2x8 tile
3187+
3188+Compute2x4Unit:
3189+    add x19, x10, x16
3190+    subs x14, x14, #8
3191+    blt Compute2x4End4
3192+    Compute2x4:
3193+        ld1 {v0.8h}, [x10], #16
3194+        ld1 {v1.8h}, [x19], #16
3195+        prfm pldl1strm, [x11, #632]
3196+        ld1 {v3.4h, v4.4h}, [x11], #16
3197+        fmla v8.4h, v3.4h, v0.h[0]
3198+        fmla v10.4h, v3.4h, v1.h[0]
3199+        ld1 {v5.4h, v6.4h}, [x11], #16
3200+        fmla v8.4h, v4.4h, v0.h[1]
3201+        fmla v10.4h, v4.4h, v1.h[1]
3202+        fmla v8.4h, v5.4h, v0.h[2]
3203+        fmla v10.4h, v5.4h, v1.h[2]
3204+        prfm pldl1strm, [x11, #632]
3205+        ld1 {v3.4h, v4.4h}, [x11], #16
3206+        fmla v8.4h, v6.4h, v0.h[3]
3207+        fmla v10.4h, v6.4h, v1.h[3]
3208+        fmla v8.4h, v3.4h, v0.h[4]
3209+        fmla v10.4h, v3.4h, v1.h[4]
3210+        ld1 {v5.4h, v6.4h}, [x11], #16
3211+        fmla v8.4h, v4.4h, v0.h[5]
3212+        fmla v10.4h, v4.4h, v1.h[5]
3213+        fmla v8.4h, v5.4h, v0.h[6]
3214+        fmla v10.4h, v5.4h, v1.h[6]
3215+        fmla v8.4h, v6.4h, v0.h[7]
3216+        fmla v10.4h, v6.4h, v1.h[7]
3217+
3218+        subs x14, x14, #8
3219+        bge Compute2x4
3220+    Compute2x4End4:
3221+        adds x14, x14, #8
3222+        cbz x14, Compute2x4Return
3223+        subs x14, x14, #4
3224+        blt Compute2x4EndTail
3225+        ld1 {v0.4h}, [x10], #8
3226+        ld1 {v1.4h}, [x19], #8
3227+        prfm pldl1strm, [x11, #632]
3228+        ld1 {v3.4h, v4.4h}, [x11], #16
3229+        fmla v8.4h, v3.4h, v0.h[0]
3230+        fmla v10.4h, v3.4h, v1.h[0]
3231+        ld1 {v5.4h, v6.4h}, [x11], #16
3232+        fmla v8.4h, v4.4h, v0.h[1]
3233+        fmla v10.4h, v4.4h, v1.h[1]
3234+        fmla v8.4h, v5.4h, v0.h[2]
3235+        fmla v10.4h, v5.4h, v1.h[2]
3236+        fmla v8.4h, v6.4h, v0.h[3]
3237+        fmla v10.4h, v6.4h, v1.h[3]
3238+        subs x14, x14, #4
3239+    Compute2x4EndTail:
3240+        adds x14, x14, #4
3241+        cbz x14, Compute2x4Return
3242+        cmp x14, #1
3243+        beq Compute2x4EndTail1
3244+        cmp x14, #2
3245+        beq Compute2x4EndTail2
3246+        ld1 {v0.4h}, [x10]
3247+        ld3 {v1.h, v2.h, v3.h}[0], [x19]
3248+        prfm pldl1strm, [x11, #632]
3249+        ld1 {v4.4h, v5.4h}, [x11], #16
3250+        fmla v8.4h, v4.4h, v0.h[0]
3251+        fmla v10.4h, v4.4h, v1.h[0]
3252+        ld1 {v6.4h}, [x11], #8
3253+        fmla v8.4h, v5.4h, v0.h[1]
3254+        fmla v10.4h, v5.4h, v2.h[0]
3255+        fmla v8.4h, v6.4h, v0.h[2]
3256+        fmla v10.4h, v6.4h, v3.h[0]
3257+        b Compute2x4Return
3258+    Compute2x4EndTail2:
3259+        ld1 {v0.4h}, [x10]
3260+        ld2 {v1.h, v2.h}[0], [x19]
3261+        prfm pldl1strm, [x11, #632]
3262+        ld1 {v3.4h, v4.4h}, [x11], #16
3263+        fmla v8.4h, v3.4h, v0.h[0]
3264+        fmla v10.4h, v3.4h, v1.h[0]
3265+        fmla v8.4h, v4.4h, v0.h[1]
3266+        fmla v10.4h, v4.4h, v2.h[0]
3267+        b Compute2x4Return
3268+    Compute2x4EndTail1:
3269+        ld1 {v0.h}[0], [x10]
3270+        ld1 {v1.h}[0], [x19]
3271+        prfm pldl1strm, [x11, #632]
3272+        ld1 {v3.4h}, [x11], #8
3273+        fmla v8.4h, v3.4h, v0.h[0]
3274+        fmla v10.4h, v3.4h, v1.h[0]
3275+    Compute2x4Return:
3276+        ret
3277+
3278+Compute1x16Unit:
3279+    subs x14, x14, #8
3280+    blt Compute1x16End4
3281+    Compute1x16:
3282+        ld1 {v0.8h}, [x10], #16
3283+        prfm pldl1strm, [x11, #632]
3284+        ld1 {v3.8h, v4.8h}, [x11], #32
3285+        fmla v8.8h, v3.8h, v0.h[0]
3286+        ld1 {v5.8h, v6.8h}, [x11], #32
3287+        fmla v9.8h, v4.8h, v0.h[0]
3288+        fmla v8.8h, v5.8h, v0.h[1]
3289+        ld1 {v3.8h, v4.8h}, [x11], #32
3290+        fmla v9.8h, v6.8h, v0.h[1]
3291+        fmla v8.8h, v3.8h, v0.h[2]
3292+        ld1 {v5.8h, v6.8h}, [x11], #32
3293+        fmla v9.8h, v4.8h, v0.h[2]
3294+        fmla v8.8h, v5.8h, v0.h[3]
3295+        prfm pldl1strm, [x11, #632]
3296+        ld1 {v3.8h, v4.8h}, [x11], #32
3297+        fmla v9.8h, v6.8h, v0.h[3]
3298+
3299+        fmla v8.8h, v3.8h, v0.h[4]
3300+        ld1 {v5.8h, v6.8h}, [x11], #32
3301+        fmla v9.8h, v4.8h, v0.h[4]
3302+        fmla v8.8h, v5.8h, v0.h[5]
3303+        ld1 {v3.8h, v4.8h}, [x11], #32
3304+        fmla v9.8h, v6.8h, v0.h[5]
3305+        fmla v8.8h, v3.8h, v0.h[6]
3306+        ld1 {v5.8h, v6.8h}, [x11], #32
3307+        fmla v9.8h, v4.8h, v0.h[6]
3308+        fmla v8.8h, v5.8h, v0.h[7]
3309+        fmla v9.8h, v6.8h, v0.h[7]
3310+
3311+        subs x14, x14, #8
3312+        bge Compute1x16
3313+    Compute1x16End4:
3314+        adds x14, x14, #8
3315+        cbz x14, Compute1x16Return
3316+        subs x14, x14, #4
3317+        blt Compute1x16EndTail
3318+        ld1 {v0.4h}, [x10], #8
3319+        prfm pldl1strm, [x11, #632]
3320+        ld1 {v3.8h, v4.8h}, [x11], #32
3321+        fmla v8.8h, v3.8h, v0.h[0]
3322+        ld1 {v5.8h, v6.8h}, [x11], #32
3323+        fmla v9.8h, v4.8h, v0.h[0]
3324+        fmla v8.8h, v5.8h, v0.h[1]
3325+        ld1 {v3.8h, v4.8h}, [x11], #32
3326+        fmla v9.8h, v6.8h, v0.h[1]
3327+        fmla v8.8h, v3.8h, v0.h[2]
3328+        ld1 {v5.8h, v6.8h}, [x11], #32
3329+        fmla v9.8h, v4.8h, v0.h[2]
3330+        fmla v8.8h, v5.8h, v0.h[3]
3331+        fmla v9.8h, v6.8h, v0.h[3]
3332+        subs x14, x14, #4
3333+    Compute1x16EndTail:
3334+        adds x14, x14, #4
3335+        cbz x14, Compute1x16Return
3336+        cmp x14, #1
3337+        beq Compute1x16EndTail1
3338+        cmp x14, #2
3339+        beq Compute1x16EndTail2
3340+        ld3 {v0.h, v1.h, v2.h}[0], [x10]
3341+        prfm pldl1strm, [x11, #632]
3342+        ld1 {v3.8h, v4.8h}, [x11], #32
3343+        fmla v8.8h, v3.8h, v0.h[0]
3344+        ld1 {v5.8h, v6.8h}, [x11], #32
3345+        fmla v9.8h, v4.8h, v0.h[0]
3346+        fmla v8.8h, v5.8h, v1.h[0]
3347+        ld1 {v3.8h, v4.8h}, [x11], #32
3348+        fmla v9.8h, v6.8h, v1.h[0]
3349+        fmla v8.8h, v3.8h, v2.h[0]
3350+        fmla v9.8h, v4.8h, v2.h[0]
3351+        b Compute1x16Return
3352+    Compute1x16EndTail2:
3353+        ld2 {v0.h, v1.h}[0], [x10]
3354+        prfm pldl1strm, [x11, #632]
3355+        ld1 {v3.8h, v4.8h}, [x11], #32
3356+        fmla v8.8h, v3.8h, v0.h[0]
3357+        ld1 {v5.8h, v6.8h}, [x11], #32
3358+        fmla v9.8h, v4.8h, v0.h[0]
3359+        fmla v8.8h, v5.8h, v1.h[0]
3360+        fmla v9.8h, v6.8h, v1.h[0]
3361+        b Compute1x16Return
3362+    Compute1x16EndTail1:
3363+        ld1 {v0.h}[0], [x10]
3364+        prfm pldl1strm, [x11, #632]
3365+        ld1 {v3.8h, v4.8h}, [x11], #32
3366+        fmla v8.8h, v3.8h, v0.h[0]
3367+        fmla v9.8h, v4.8h, v0.h[0]
3368+    Compute1x16Return:
3369+        ret
3370+
3371+Compute1x8Unit:
3372+    subs x14, x14, #8
3373+    blt Compute1x8End4
3374+    Compute1x8:
3375+        ld1 {v0.8h}, [x10], #16
3376+        prfm pldl1strm, [x11, #632]
3377+        ld1 {v3.8h, v4.8h}, [x11], #32
3378+        fmla v8.8h, v3.8h, v0.h[0]
3379+        ld1 {v5.8h, v6.8h}, [x11], #32
3380+        fmla v8.8h, v4.8h, v0.h[1]
3381+        fmla v8.8h, v5.8h, v0.h[2]
3382+        prfm pldl1strm, [x11, #632]
3383+        ld1 {v3.8h, v4.8h}, [x11], #32
3384+        fmla v8.8h, v6.8h, v0.h[3]
3385+        fmla v8.8h, v3.8h, v0.h[4]
3386+        ld1 {v5.8h, v6.8h}, [x11], #32
3387+        fmla v8.8h, v4.8h, v0.h[5]
3388+        fmla v8.8h, v5.8h, v0.h[6]
3389+        fmla v8.8h, v6.8h, v0.h[7]
3390+
3391+        subs x14, x14, #8
3392+        bge Compute1x8
3393+    Compute1x8End4:
3394+        adds x14, x14, #8
3395+        cbz x14, Compute1x8Return
3396+        subs x14, x14, #4
3397+        blt Compute1x8EndTail
3398+        ld1 {v0.4h}, [x10], #8
3399+        prfm pldl1strm, [x11, #632]
3400+        ld1 {v3.8h, v4.8h}, [x11], #32
3401+        fmla v8.8h, v3.8h, v0.h[0]
3402+        ld1 {v5.8h, v6.8h}, [x11], #32
3403+        fmla v8.8h, v4.8h, v0.h[1]
3404+        fmla v8.8h, v5.8h, v0.h[2]
3405+        fmla v8.8h, v6.8h, v0.h[3]
3406+        subs x14, x14, #4
3407+    Compute1x8EndTail:
3408+        adds x14, x14, #4
3409+        cbz x14, Compute1x8Return
3410+        cmp x14, #1
3411+        beq Compute1x8EndTail1
3412+        cmp x14, #2
3413+        beq Compute1x8EndTail2
3414+        ld3 {v0.h, v1.h, v2.h}[0], [x10]
3415+        prfm pldl1strm, [x11, #632]
3416+        ld1 {v3.8h, v4.8h}, [x11], #32
3417+        fmla v8.8h, v3.8h, v0.h[0]
3418+        ld1 {v5.8h}, [x11], #16
3419+        fmla v8.8h, v4.8h, v1.h[0]
3420+        fmla v8.8h, v5.8h, v2.h[0]
3421+        b Compute1x8Return
3422+    Compute1x8EndTail2:
3423+        ld2 {v0.h, v1.h}[0], [x10]
3424+        prfm pldl1strm, [x11, #632]
3425+        ld1 {v3.8h, v4.8h}, [x11], #32
3426+        fmla v8.8h, v3.8h, v0.h[0]
3427+        fmla v8.8h, v4.8h, v1.h[0]
3428+        b Compute1x8Return
3429+    Compute1x8EndTail1:
3430+        ld1 {v0.h}[0], [x10]
3431+        prfm pldl1strm, [x11, #632]
3432+        ld1 {v3.8h}, [x11], #16
3433+        fmla v8.8h, v3.8h, v0.h[0]
3434+    Compute1x8Return:
3435+        ret
3436+
3437+Compute1x4Unit:
3438+    subs x14, x14, #8
3439+    blt Compute1x4End4
3440+    Compute1x4:
3441+        ld1 {v0.8h}, [x10], #16
3442+        prfm pldl1strm, [x11, #632]
3443+        ld1 {v3.4h, v4.4h}, [x11], #16
3444+        fmla v8.4h, v3.4h, v0.h[0]
3445+        ld1 {v5.4h, v6.4h}, [x11], #16
3446+        fmla v8.4h, v4.4h, v0.h[1]
3447+        fmla v8.4h, v5.4h, v0.h[2]
3448+        prfm pldl1strm, [x11, #632]
3449+        ld1 {v3.4h, v4.4h}, [x11], #16
3450+        fmla v8.4h, v6.4h, v0.h[3]
3451+        fmla v8.4h, v3.4h, v0.h[4]
3452+        ld1 {v5.4h, v6.4h}, [x11], #16
3453+        fmla v8.4h, v4.4h, v0.h[5]
3454+        fmla v8.4h, v5.4h, v0.h[6]
3455+        fmla v8.4h, v6.4h, v0.h[7]
3456+
3457+        subs x14, x14, #8
3458+        bge Compute1x4
3459+    Compute1x4End4:
3460+        adds x14, x14, #8
3461+        cbz x14, Compute1x4Return
3462+        subs x14, x14, #4
3463+        blt Compute1x4EndTail
3464+        ld1 {v0.4h}, [x10], #8
3465+        prfm pldl1strm, [x11, #632]
3466+        ld1 {v3.4h, v4.4h}, [x11], #16
3467+        fmla v8.4h, v3.4h, v0.h[0]
3468+        ld1 {v5.4h, v6.4h}, [x11], #16
3469+        fmla v8.4h, v4.4h, v0.h[1]
3470+        fmla v8.4h, v5.4h, v0.h[2]
3471+        fmla v8.4h, v6.4h, v0.h[3]
3472+        subs x14, x14, #4
3473+    Compute1x4EndTail:
3474+        adds x14, x14, #4
3475+        cbz x14, Compute1x4Return
3476+        cmp x14, #1
3477+        beq Compute1x4EndTail1
3478+        cmp x14, #2
3479+        beq Compute1x4EndTail2
3480+        ld3 {v0.h, v1.h, v2.h}[0], [x10]
3481+        prfm pldl1strm, [x11, #632]
3482+        ld1 {v3.4h, v4.4h}, [x11], #16
3483+        fmla v8.4h, v3.4h, v0.h[0]
3484+        ld1 {v5.4h}, [x11], #8
3485+        fmla v8.4h, v4.4h, v1.h[0]
3486+        fmla v8.4h, v5.4h, v2.h[0]
3487+        b Compute1x4Return
3488+    Compute1x4EndTail2:
3489+        ld2 {v0.h, v1.h}[0], [x10]
3490+        prfm pldl1strm, [x11, #632]
3491+        ld1 {v3.4h, v4.4h}, [x11], #16
3492+        fmla v8.4h, v3.4h, v0.h[0]
3493+        fmla v8.4h, v4.4h, v1.h[0]
3494+        b Compute1x4Return
3495+    Compute1x4EndTail1:
3496+        ld1 {v0.h}[0], [x10]
3497+        prfm pldl1strm, [x11, #632]
3498+        ld1 {v3.4h}, [x11], #8
3499+        fmla v8.4h, v3.4h, v0.h[0]
3500+    Compute1x4Return:
3501+        ret
3502+
3503+End:
3504+  sub sp, sp, #192
3505+  ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
3506+  ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
3507+  ldp x19, x20, [sp], #16
3508+  ldp x21, x22, [sp], #16
3509+  ldp x23, x24, [sp], #16
3510+  ldp x29, x30, [sp], #16
3511+  ret
3512+#endif
3513diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/custom_gather_d_grad_v2_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/custom_gather_d_grad_v2_parameter.h
3514new file mode 100644
3515index 00000000..541c7ff1
3516--- /dev/null
3517+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/custom_gather_d_grad_v2_parameter.h
3518@@ -0,0 +1,28 @@
3519+/**
3520+ * Copyright 2023 Huawei Technologies Co., Ltd
3521+ *
3522+ * Licensed under the Apache License, Version 2.0 (the "License");
3523+ * you may not use this file except in compliance with the License.
3524+ * You may obtain a copy of the License at
3525+ *
3526+ * http://www.apache.org/licenses/LICENSE-2.0
3527+ *
3528+ * Unless required by applicable law or agreed to in writing, software
3529+ * distributed under the License is distributed on an "AS IS" BASIS,
3530+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3531+ * See the License for the specific language governing permissions and
3532+ * limitations under the License.
3533+ */
3534+#ifndef MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_PARAMETER_H_
3535+#define MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_PARAMETER_H_
3536+
3537+#include "nnacl/op_base.h"
3538+
3539+typedef struct CustomGatherGradV2Parameter {
3540+  // Primitive parameter
3541+  OpParameter op_parameter_;
3542+  // shape correlative
3543+  int dim;
3544+} CustomGatherGradV2Parameter;
3545+
3546+#endif  // MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_PARAMETER_H_
3547diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c
3548index 6e754569..72391811 100644
3549--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c
3550+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c
3551@@ -35,13 +35,13 @@ void CustomGruFp16(float16_t *output, const float16_t *input, const float16_t *w
3552   float16_t *hidden_gate = buffer[C3NUM];
3553   for (int i = 0; i < num_step; ++i) {
3554     if (batch_size != 1) {
3555-      RowMajor2ColNMajorFp16(input + i * batch_size * input_size, buffer[0], batch_size, input_size);
3556+      RowMajor2ColNMajorFp16(input + i * batch_size * input_size, buffer[0], batch_size, input_size, false);
3557       for (int j = 0; j < C3NUM; ++j) {
3558         MatmulBaseFp16Neon(buffer[0], weight_input + j * weight_in_offset, input_gate + j * output_size,
3559                            bias_input + j * col_align, ActType_No, input_size, batch_size, hidden_size, hidden_size,
3560                            OutType_Nhwc);
3561       }
3562-      RowMajor2ColNMajorFp16(init_h, buffer[C2NUM], batch_size, hidden_size);
3563+      RowMajor2ColNMajorFp16(init_h, buffer[C2NUM], batch_size, hidden_size, false);
3564       for (int j = 0; j < C3NUM; ++j) {
3565         MatmulBaseFp16Neon(buffer[C2NUM], weight_hidden + j * weight_hidden_offset, hidden_gate + j * output_size,
3566                            bias_hidden + j * col_align, ActType_No, hidden_size, batch_size, hidden_size, hidden_size,
3567diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c
3568index d1555953..93f005c8 100644
3569--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c
3570+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c
3571@@ -20,8 +20,10 @@
3572
3573 #if defined(ENABLE_NEON)
3574 static inline void simd_exp_fp16(float16x8_t input, float16_t *dst) {
3575-  static float16x8_t maxv = {88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f};
3576-  static float16x8_t minv = {-88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f};
3577+  static float16x8_t maxv = {88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f,
3578+                             88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f};
3579+  static float16x8_t minv = {-87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f,
3580+                             -87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f};
3581   input = vmaxq_f16(minv, vminq_f16(input, maxv));
3582   vst1q_f16(dst, VexpFp16(input));
3583 }
3584diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c
3585index 813237fa..614842a1 100644
3586--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c
3587+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c
3588@@ -23,28 +23,38 @@
3589 #include "nnacl/fp16/cast_fp16.h"
3590 #include "nnacl/intrinsics/ms_simd_instructions_fp16.h"
3591
3592-void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align) {
3593+void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align,
3594+                              const int32_t *order) {
3595   for (int i = 0; i < batch; i++) {
3596     const float *src_batch = src + i * col * deep;
3597-    float16_t *dst_batch = dst + i * col_align * deep;
3598+    float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align * deep;
3599+#ifdef ENABLE_ARM64
3600+    RowMajor2ColNMajorFp16(src_batch, dst_batch, col, deep, true);
3601+#else
3602     RowMajor2Col8MajorFp16(src_batch, dst_batch, col, deep, true);
3603+#endif
3604   }
3605 }
3606
3607-void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align) {
3608+void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align,
3609+                        const int32_t *order) {
3610   for (int i = 0; i < batch; i++) {
3611     const float16_t *src_batch = src + i * col * deep;
3612-    float16_t *dst_batch = dst + i * col_align * deep;
3613+    float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align * deep;
3614+#ifdef ENABLE_ARM64
3615+    RowMajor2ColNMajorFp16(src_batch, dst_batch, col, deep, false);
3616+#else
3617     RowMajor2Col8MajorFp16(src_batch, dst_batch, col, deep, false);
3618+#endif
3619   }
3620 }
3621
3622-void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align,
3623-                            bool is_bidirectional) {
3624+void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional,
3625+                            const int32_t *order) {
3626   int unidirectional_batch = is_bidirectional ? batch / 2 : batch;
3627   for (int i = 0; i < unidirectional_batch; i++) {
3628     const float *src_batch = src + i * col;
3629-    float16_t *dst_batch = dst + i * col_align;
3630+    float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align;
3631     Float32ToFloat16(src_batch, dst_batch, col);
3632   }
3633   if (is_bidirectional) {
3634@@ -52,17 +62,18 @@ void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col
3635     float16_t *backward_dst = dst + unidirectional_batch * col_align;
3636     for (int i = 0; i < unidirectional_batch; i++) {
3637       const float *backward_src_batch = backward_src + i * col;
3638-      float16_t *backward_dst_batch = backward_dst + i * col_align;
3639+      float16_t *backward_dst_batch = backward_dst + (order == NULL ? i : order[i]) * col_align;
3640       Float32ToFloat16(backward_src_batch, backward_dst_batch, col);
3641     }
3642   }
3643 }
3644
3645-void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional) {
3646+void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional,
3647+                      const int32_t *order) {
3648   int unidirectional_batch = is_bidirectional ? batch / 2 : batch;
3649   for (int i = 0; i < unidirectional_batch; i++) {
3650     const float16_t *src_batch = src + i * col;
3651-    float16_t *dst_batch = dst + i * col_align;
3652+    float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align;
3653     (void)memcpy(dst_batch, src_batch, col * sizeof(float16_t));
3654   }
3655   if (is_bidirectional) {
3656@@ -70,7 +81,7 @@ void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col,
3657     float16_t *backward_dst = dst + unidirectional_batch * col_align;
3658     for (int i = 0; i < unidirectional_batch; i++) {
3659       const float16_t *backward_src_batch = backward_src + i * col;
3660-      float16_t *backward_dst_batch = backward_dst + i * col_align;
3661+      float16_t *backward_dst_batch = backward_dst + (order == NULL ? i : order[i]) * col_align;
3662       (void)memcpy(backward_dst_batch, backward_src_batch, col * sizeof(float16_t));
3663     }
3664   }
3665@@ -152,13 +163,13 @@ void UpdateOutputFp16(float16_t *hidden_state, float16_t *output, const float16_
3666                       const LstmParameter *lstm_param) {
3667   int batch = lstm_param->batch_;
3668   int hidden_size = lstm_param->hidden_size_;
3669-  int project_size = lstm_param->project_size_;
3670+  int output_size = lstm_param->output_size_;
3671   float16_t *state_buffer = buffer[C5NUM];
3672   float16_t *hidden_buffer = weight_project ? buffer[C3NUM] : hidden_state;
3673   float16_t zoneout = lstm_param->zoneout_hidden_;
3674   if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) {
3675-    (void)memcpy(state_buffer, hidden_state, batch * project_size * sizeof(float16_t));
3676-    ElementOptMulFp16(state_buffer, &zoneout, state_buffer, batch * project_size, false);
3677+    (void)memcpy(state_buffer, hidden_state, batch * output_size * sizeof(float16_t));
3678+    ElementOptMulFp16(state_buffer, &zoneout, state_buffer, batch * output_size, false);
3679   }
3680
3681   TanhFp16(cell_state, hidden_buffer, batch * hidden_size);
3682@@ -166,19 +177,32 @@ void UpdateOutputFp16(float16_t *hidden_state, float16_t *output, const float16_
3683
3684   if (weight_project) {
3685     float16_t *left_matrix = hidden_buffer;
3686+#ifdef ENABLE_ARM64
3687+    if (batch >= C4NUM) {
3688+      left_matrix = buffer[C6NUM];
3689+      RowMajor2ColLadder12MajorFp16(hidden_buffer, left_matrix, batch, hidden_size);
3690+    }
3691+#else
3692     if (batch != 1) {
3693       left_matrix = buffer[C6NUM];
3694       RowMajor2Col16MajorFp16(hidden_buffer, left_matrix, batch, hidden_size, false);
3695     }
3696-    LstmMatMulFp16(hidden_state, left_matrix, weight_project, project_bias, batch, hidden_size, project_size,
3697+#endif
3698+    LstmMatMulFp16(hidden_state, left_matrix, weight_project, project_bias, batch, hidden_size, output_size,
3699                    batch == 1);
3700   }
3701   if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) {
3702-    ElementOptMulAccFp16(hidden_state, 1 - zoneout, state_buffer, batch * project_size);
3703+    ElementOptMulAccFp16(hidden_state, 1 - zoneout, state_buffer, batch * output_size);
3704   }
3705-  (void)memcpy(output, hidden_state, batch * project_size * sizeof(float16_t));
3706+  (void)memcpy(output, hidden_state, batch * output_size * sizeof(float16_t));
3707 }
3708
3709+#ifdef ENABLE_ARM64
3710+void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const float16_t *bias, int row, int deep,
3711+                    int col, bool is_vec) {
3712+  MatmulFp16OptV2(a, b, c, bias, ActType_No, deep, row, col, col, OutType_Nhwc);
3713+}
3714+#else
3715 void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const float16_t *bias, int row, int deep,
3716                     int col, bool is_vec) {
3717   if (is_vec) {
3718@@ -188,11 +212,12 @@ void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const
3719     MatMulFp16(a, b, c, bias, ActType_No, deep, row, col, col, OutType_Nhwc);
3720   }
3721 }
3722+#endif
3723
3724 void UpdateLstmGateFp16(float16_t *gate_buffer, const float16_t *input, const float16_t *weight, const float16_t *bias,
3725                         int row, int deep, int col, int col_align, bool is_vec) {
3726   for (int i = 0; i < 4; i++) {
3727-    const float16_t *weight_i = weight + deep * col * i;
3728+    const float16_t *weight_i = weight + deep * col_align * i;
3729     const float16_t *bias_i = bias + col_align * i;
3730     float16_t *gate = gate_buffer + row * col * i;
3731     LstmMatMulFp16(gate, input, weight_i, bias_i, row, deep, col, is_vec);
3732@@ -207,16 +232,26 @@ void LstmStepUnitFp16(float16_t *output, float16_t *input_gate, float16_t *forge
3733   float16_t *state_gate = buffer[C3NUM];
3734   float16_t *cell_buffer = buffer[C4NUM];
3735   float16_t *hidden_buffer = buffer[C5NUM];
3736+#ifdef ENABLE_ARM64
3737+  if (lstm_param->batch_ <= C3NUM) {
3738+    UpdateLstmGateFp16(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_,
3739+                       lstm_param->hidden_size_, lstm_param->state_col_align_, false);
3740+  } else {
3741+    RowMajor2ColLadder12MajorFp16(hidden_state, packed_state, lstm_param->batch_, lstm_param->output_size_);
3742+    UpdateLstmGateFp16(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_,
3743+                       lstm_param->hidden_size_, lstm_param->state_col_align_, false);
3744+  }
3745+#else
3746   bool is_vec = lstm_param->batch_ == 1;
3747   if (is_vec) {
3748-    UpdateLstmGateFp16(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_,
3749-                       lstm_param->project_size_, lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec);
3750+    UpdateLstmGateFp16(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_,
3751+                       lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec);
3752   } else {
3753-    // pack state for matmul
3754-    RowMajor2Col16MajorFp16(hidden_state, packed_state, lstm_param->batch_, lstm_param->project_size_, false);
3755-    UpdateLstmGateFp16(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_,
3756-                       lstm_param->project_size_, lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec);
3757+    RowMajor2Col16MajorFp16(hidden_state, packed_state, lstm_param->batch_, lstm_param->output_size_, false);
3758+    UpdateLstmGateFp16(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_,
3759+                       lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec);
3760   }
3761+#endif
3762   ElementAddFp16(input_gate, state_gate, input_gate, lstm_param->batch_ * lstm_param->hidden_size_);
3763   ElementAddFp16(forget_gate, state_gate + lstm_param->batch_ * lstm_param->hidden_size_ * 2, forget_gate,
3764                  lstm_param->batch_ * lstm_param->hidden_size_);
3765@@ -247,24 +282,43 @@ void LstmStepUnitFp16(float16_t *output, float16_t *input_gate, float16_t *forge
3766   }
3767
3768   if (!(lstm_param->zoneout_hidden_ >= -FLT_EPSILON && lstm_param->zoneout_hidden_ <= FLT_EPSILON)) {
3769-    (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->project_size_ * sizeof(float16_t));
3770+    (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->output_size_ * sizeof(float16_t));
3771   }
3772 }
3773
3774-void LstmUnidirectionalFp16(float16_t *output, const float16_t *packed_input, const float16_t *weight_i,
3775-                            const float16_t *weight_h, const float16_t *input_bias, const float16_t *state_bias,
3776-                            const float16_t *weight_project, const float16_t *project_bias, float16_t *hidden_state,
3777-                            float16_t *cell_state, float16_t *buffer[C7NUM], const LstmParameter *lstm_param,
3778-                            bool is_backward) {
3779-  float16_t *gate = buffer[1];
3780+#ifdef ENABLE_ARM64
3781+void LstmGateCompute(float16_t *gate, const float16_t *input, const float16_t *weight_i, const float16_t *input_bias,
3782+                     const LstmParameter *lstm_param) {
3783+  int row_input = lstm_param->seq_len_ * lstm_param->batch_;
3784+  for (int i = 0; i < C4NUM; i++) {
3785+    const float16_t *weight_loop = weight_i + lstm_param->input_size_ * lstm_param->input_col_align_ * i;
3786+    const float16_t *bias_loop = input_bias + lstm_param->input_col_align_ * i;
3787+    float16_t *gate_loop = gate + lstm_param->seq_len_ * lstm_param->batch_ * lstm_param->hidden_size_ * i;
3788+    MatmulFp16OptV2(input, weight_loop, gate_loop, bias_loop, ActType_No, lstm_param->input_size_, row_input,
3789+                    lstm_param->hidden_size_, lstm_param->hidden_size_, OutType_Nhwc);
3790+  }
3791+}
3792+#else
3793+void LstmGateCompute(float16_t *gate, const float16_t *input, const float16_t *weight_i, const float16_t *input_bias,
3794+                     const LstmParameter *lstm_param) {
3795   for (int i = 0; i < C4NUM; i++) {
3796     const float16_t *weight_loop = weight_i + lstm_param->input_size_ * lstm_param->input_col_align_ * i;
3797     const float16_t *bias_loop = input_bias + lstm_param->input_col_align_ * i;
3798     float16_t *gate_loop = gate + lstm_param->seq_len_ * lstm_param->batch_ * lstm_param->hidden_size_ * i;
3799-    MatMulFp16(packed_input, weight_loop, gate_loop, bias_loop, ActType_No, lstm_param->input_size_,
3800+    MatMulFp16(input, weight_loop, gate_loop, bias_loop, ActType_No, lstm_param->input_size_,
3801                lstm_param->seq_len_ * lstm_param->batch_, lstm_param->hidden_size_, lstm_param->hidden_size_,
3802                OutType_Nhwc);
3803   }
3804+}
3805+#endif
3806+
3807+void LstmUnidirectionalFp16(float16_t *output, const float16_t *packed_input, const float16_t *weight_i,
3808+                            const float16_t *weight_h, const float16_t *input_bias, const float16_t *state_bias,
3809+                            const float16_t *weight_project, const float16_t *project_bias, float16_t *hidden_state,
3810+                            float16_t *cell_state, float16_t *buffer[C7NUM], const LstmParameter *lstm_param,
3811+                            bool is_backward) {
3812+  float16_t *gate = buffer[1];
3813+  LstmGateCompute(gate, packed_input, weight_i, input_bias, lstm_param);
3814
3815   float16_t *input_gate = gate;
3816   float16_t *forget_gate = gate + lstm_param->seq_len_ * lstm_param->batch_ * lstm_param->hidden_size_ * 2;
3817@@ -287,26 +341,33 @@ void LstmFp16(float16_t *output, const float16_t *input, const float16_t *weight
3818               const float16_t *project_bias, float16_t *hidden_state, float16_t *cell_state, float16_t *buffer[C7NUM],
3819               const LstmParameter *lstm_param) {
3820   // forward
3821+#ifdef ENABLE_ARM64
3822+  const float16_t *packed_input = input;
3823+  if (lstm_param->batch_ * lstm_param->seq_len_ >= C4NUM) {
3824+    float16_t *temp_input = buffer[0];
3825+    RowMajor2ColLadder12MajorFp16(input, temp_input, lstm_param->seq_len_ * lstm_param->batch_,
3826+                                  lstm_param->input_size_);
3827+    packed_input = temp_input;
3828+  }
3829+#else
3830   float16_t *packed_input = buffer[0];
3831   RowMajor2Col16MajorFp16(input, packed_input, lstm_param->seq_len_ * lstm_param->batch_, lstm_param->input_size_,
3832                           false);
3833+#endif
3834   LstmUnidirectionalFp16(output, packed_input, weight_i, weight_h, input_bias, state_bias, weight_project, project_bias,
3835                          hidden_state, cell_state, buffer, lstm_param, false);
3836
3837   // backward
3838   if (lstm_param->bidirectional_) {
3839     const float16_t *backward_weight_i = weight_i + 4 * lstm_param->input_col_align_ * lstm_param->input_size_;
3840-    const float16_t *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->hidden_size_;
3841+    const float16_t *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->output_size_;
3842     const float16_t *backward_input_bias = input_bias + 4 * lstm_param->input_col_align_;
3843     const float16_t *backward_state_bias = state_bias + 4 * lstm_param->state_col_align_;
3844     const float16_t *backward_weight_project =
3845-      weight_project ? weight_project + lstm_param->hidden_size_ * (lstm_param->batch_ == 1
3846-                                                                      ? lstm_param->project_size_
3847-                                                                      : UP_ROUND(lstm_param->project_size_, C8NUM))
3848-                     : NULL;
3849-    float16_t *backward_output = output + lstm_param->batch_ * lstm_param->hidden_size_;
3850+      weight_project ? weight_project + lstm_param->hidden_size_ * lstm_param->proj_col_align_ : NULL;
3851+    float16_t *backward_output = output + lstm_param->batch_ * lstm_param->output_size_;
3852     float16_t *backward_cell_state = cell_state + lstm_param->batch_ * lstm_param->hidden_size_;
3853-    float16_t *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->hidden_size_;
3854+    float16_t *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->output_size_;
3855
3856     LstmUnidirectionalFp16(backward_output, packed_input, backward_weight_i, backward_weight_h, backward_input_bias,
3857                            backward_state_bias, backward_weight_project, project_bias, backward_hidden_state,
3858diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h
3859index f6f853b4..d6af9c78 100644
3860--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h
3861+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h
3862@@ -21,13 +21,17 @@
3863 #ifdef __cplusplus
3864 extern "C" {
3865 #endif
3866-void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align);
3867+void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align,
3868+                              const int32_t *order);
3869
3870-void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align);
3871+void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align,
3872+                        const int32_t *order);
3873
3874-void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional);
3875+void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional,
3876+                            const int32_t *order);
3877
3878-void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional);
3879+void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional,
3880+                      const int32_t *order);
3881
3882 void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const float16_t *bias, int row, int deep,
3883                     int col, bool is_vec);
3884diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c
3885index 1aefbaf5..39dcb9ee 100644
3886--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c
3887+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c
3888@@ -16,7 +16,7 @@
3889
3890 #include "nnacl/fp16/matmul_fp16.h"
3891
3892-static void Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
3893+static void Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, int row, int col) {
3894   int row_c8 = row / C8NUM * C8NUM;
3895   int col_c8 = col / C8NUM * C8NUM;
3896   const float16_t *src = (const float16_t *)src_ptr;
3897@@ -108,7 +108,7 @@ static void Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, size_t
3898   }
3899 }
3900
3901-static void Col2Row8SrcFromFp32(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
3902+static void Col2Row8SrcFromFp32(const void *src_ptr, float16_t *dst_ptr, int row, int col) {
3903   int row_c8 = row / C8NUM * C8NUM;
3904   int col_c8 = col / C8NUM * C8NUM;
3905   int ci = 0;
3906@@ -410,17 +410,14 @@ void VecMatmulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f
3907     int di = 0;
3908     for (; di < depth - C8NUM + 1; di += C8NUM) {
3909       float16x8_t av = vld1q_f16(a + di);
3910-      float16x8_t bv_0;
3911-      float16x8_t bv_1;
3912-      for (int i = 0; i < C8NUM; i += C2NUM) {
3913-        bv_0 = vld1q_f16(bv_base);                // bv_i为一行,8列数据
3914-        acc_0 = vfmaq_n_f16(acc_0, bv_0, av[i]);  // av[i]为向量中的一个值
3915-        bv_base += C8NUM;
3916-
3917-        bv_1 = vld1q_f16(bv_base);                    // bv_i为一行,8列数据
3918-        acc_0 = vfmaq_n_f16(acc_0, bv_1, av[i + 1]);  // av[i]为向量中的一个值
3919+      float16x8_t bv_0[C8NUM];
3920+      for (int i = 0; i < C8NUM; ++i) {
3921+        bv_0[i] = vld1q_f16(bv_base);
3922         bv_base += C8NUM;
3923       }
3924+      for (int i = 0; i < C8NUM; ++i) {
3925+        acc_0 = vfmaq_n_f16(acc_0, bv_0[i], av[i]);
3926+      }
3927     }
3928     if (di < depth) {
3929       for (; di < depth; ++di) {
3930@@ -636,8 +633,94 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
3931 }
3932
3933 #ifdef ENABLE_ARM64
3934-void RowMajor2ColNMajorFp16(const float16_t *src_ptr, float16_t *dst_ptr, int row, int col) {
3935-  // Col16Major ==> Col8Major ==> Col4Major
3936+void RowMajor2ColLadder12MajorFp16(const float16_t *src, float16_t *dst_ptr, int row, int col) {
3937+  // Col12Major ==> Col8Major ==> Col4Major
3938+  const float16_t *src_r = src;
3939+  float16_t *dst_r = dst_ptr;
3940+  int ri = 0;
3941+  size_t col8 = col / C8NUM * C8NUM;
3942+  // find 12 block unit
3943+  for (; ri <= row - C12NUM; ri += C12NUM) {
3944+    size_t ci = 0;
3945+    for (; ci < col8; ci += C8NUM) {
3946+      const float16_t *src_c = src_r + ci;
3947+      float16_t *dst_c = dst_r + ci * C12NUM;
3948+      Transpose12x8ARM64Fp16(src_c, dst_c, col * C2NUM, C24NUM);
3949+    }
3950+    for (; ci < col; ci++) {
3951+      const float16_t *src_c = src_r + ci;
3952+      float16_t *dst_c = dst_r + ci * C12NUM;
3953+      for (size_t i = 0; i < C12NUM; i++) {
3954+        dst_c[i] = src_c[i * col];
3955+      }
3956+    }
3957+    src_r += C12NUM * col;
3958+    dst_r += C12NUM * col;
3959+  }
3960+  for (; ri <= row - C8NUM; ri += C8NUM) {
3961+    size_t ci = 0;
3962+    for (; ci < col8; ci += C8NUM) {
3963+      const float16_t *src_c = src_r + ci;
3964+      float16_t *dst_c = dst_r + ci * C8NUM;
3965+      Transpose8x8ARM64Fp16(src_c, dst_c, col * sizeof(float16_t), C8NUM * sizeof(float16_t));
3966+    }
3967+    for (; ci < col; ci++) {
3968+      const float16_t *src_c = src_r + ci;
3969+      float16_t *dst_c = dst_r + ci * C8NUM;
3970+      for (size_t i = 0; i < C8NUM; i++) {
3971+        dst_c[i] = src_c[i * col];
3972+      }
3973+    }
3974+    src_r += C8NUM * col;
3975+    dst_r += C8NUM * col;
3976+  }
3977+  for (; ri <= row - C4NUM; ri += C4NUM) {
3978+    size_t ci = 0;
3979+    for (; ci < col8; ci += C8NUM) {
3980+      const float16_t *src_c = src_r + ci;
3981+      float16_t *dst_c = dst_r + ci * C4NUM;
3982+      Transpose4x8ARM64Fp16(src_c, dst_c, col * sizeof(float16_t), C4NUM * sizeof(float16_t));
3983+    }
3984+    for (; ci < col; ci++) {
3985+      const float16_t *src_c = src_r + ci;
3986+      float16_t *dst_c = dst_r + ci * C4NUM;
3987+      for (size_t i = 0; i < C4NUM; i++) {
3988+        dst_c[i] = src_c[i * col];
3989+      }
3990+    }
3991+    src_r += C4NUM * col;
3992+    dst_r += C4NUM * col;
3993+  }
3994+  if (ri < row) {
3995+    memcpy(dst_r, src_r, (row - ri) * col * C2NUM);
3996+  }
3997+}
3998+
3999+void RowMajor2RowLadder12MajorFp16(const float16_t *src, float16_t *dst, int row, int col) {
4000+  // Row12 ==> Row8 ==> Row4
4001+  for (int r = 0; r < row; r++) {
4002+    int c = 0;
4003+    for (; c <= col - C12NUM; c += C12NUM) {
4004+      MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c);
4005+      MS_FLOAT16X4 src_data1 = MS_LD_F16(src + r * col + c + C8NUM);
4006+      MS_STQ_F16(dst + c / C12NUM * C12NUM * row + r * C12NUM, src_data);
4007+      MS_ST_F16(dst + c / C12NUM * C12NUM * row + r * C12NUM + C8NUM, src_data1);
4008+    }
4009+    for (; c <= col - C8NUM; c += C8NUM) {
4010+      MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c);
4011+      MS_STQ_F16(dst + c / C12NUM * C12NUM * row + r * C8NUM, src_data);
4012+    }
4013+    for (; c <= col - C4NUM; c += C4NUM) {
4014+      MS_FLOAT16X4 src_data = MS_LD_F16(src + r * col + c);
4015+      MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data);
4016+    }
4017+    for (; c < col; ++c) {
4018+      dst[c / C4NUM * C4NUM * row + r + c % C4NUM * row] = src[r * col + c];
4019+    }
4020+  }
4021+}
4022+
4023+void RowMajor2ColNMajorFp16srcFp16(const float16_t *src_ptr, float16_t *dst_ptr, int row, int col) {
4024   const float16_t *src_r = src_ptr;
4025   float16_t *dst_r = dst_ptr;
4026   int ri = 0;
4027@@ -702,6 +785,112 @@ void RowMajor2ColNMajorFp16(const float16_t *src_ptr, float16_t *dst_ptr, int ro
4028     dst_r += 1;
4029   }
4030 }
4031+
4032+void RowMajor2ColNMajorFp16(const void *src_ptr, float16_t *dst_ptr, int row, int col, bool is_fp32_src) {
4033+  // Col16Major ==> Col8Major ==> Col4Major
4034+  if (!is_fp32_src) {
4035+    RowMajor2ColNMajorFp16srcFp16((const float16_t *)src_ptr, dst_ptr, row, col);
4036+    return;
4037+  }
4038+  const float *src_r = src_ptr;
4039+  float16_t *dst_r = dst_ptr;
4040+  int ri = 0;
4041+  // find 16 block unit
4042+  for (; ri <= row - C16NUM; ri += C16NUM) {
4043+    for (int r = 0; r < C16NUM; ++r) {
4044+      for (int c = 0; c < col; ++c) {
4045+        dst_r[c * C16NUM + r % C16NUM] = src_r[r * col + c];
4046+      }
4047+    }
4048+    src_r += C16NUM * col;
4049+    dst_r += C16NUM * col;
4050+  }
4051+  for (; ri <= row - C8NUM; ri += C8NUM) {
4052+    for (int r = 0; r < C8NUM; ++r) {
4053+      for (int c = 0; c < col; ++c) {
4054+        dst_r[c * C8NUM + r % C8NUM] = src_r[r * col + c];
4055+      }
4056+    }
4057+    src_r += C8NUM * col;
4058+    dst_r += C8NUM * col;
4059+  }
4060+  for (; ri <= row - C4NUM; ri += C4NUM) {
4061+    for (int r = 0; r < C4NUM; ++r) {
4062+      for (int c = 0; c < col; ++c) {
4063+        dst_r[c * C4NUM + r % C4NUM] = src_r[r * col + c];
4064+      }
4065+    }
4066+    src_r += C4NUM * col;
4067+    dst_r += C4NUM * col;
4068+  }
4069+  for (; ri < row; ++ri) {
4070+    for (size_t i = 0; i < col; ++i) {
4071+      dst_r[i * C4NUM] = src_r[i];
4072+    }
4073+    src_r += col;
4074+    dst_r += 1;
4075+  }
4076+}
4077+
4078+void RowMajor2RowNMajorFp16(const void *src_ptr, float16_t *dst, int row, int col, bool is_fp32_src) {
4079+  // Row16 ==> Row8 ==> Row4
4080+  if (is_fp32_src) {
4081+    const float *src = (const float *)src_ptr;
4082+    for (int r = 0; r < row; r++) {
4083+      int c = 0;
4084+      for (; c <= col - C16NUM; c += C16NUM) {
4085+        const float *cur_src = src + r * col + c;
4086+        MS_FLOAT32X4X4 src_f32_data = {MS_LDQ_F32(cur_src), MS_LDQ_F32(cur_src + C4NUM), MS_LDQ_F32(cur_src + C8NUM),
4087+                                       MS_LDQ_F32(cur_src + C12NUM)};
4088+        MS_FLOAT16X4X4 res = {
4089+          MS_CVT_F16_F32(src_f32_data.val[0]),
4090+          MS_CVT_F16_F32(src_f32_data.val[1]),
4091+          MS_CVT_F16_F32(src_f32_data.val[2]),
4092+          MS_CVT_F16_F32(src_f32_data.val[3]),
4093+        };
4094+        MS_ST4_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM, res);
4095+      }
4096+      for (; c <= col - C8NUM; c += C8NUM) {
4097+        const float *cur_src = src + r * col + c;
4098+        MS_FLOAT32X4X2 src_f32_data = {MS_LDQ_F32(cur_src), MS_LDQ_F32(cur_src + C4NUM)};
4099+        MS_FLOAT16X4X2 res = {
4100+          MS_CVT_F16_F32(src_f32_data.val[0]),
4101+          MS_CVT_F16_F32(src_f32_data.val[1]),
4102+        };
4103+        MS_ST2_F16(dst + c / C8NUM * C8NUM * row + r * C8NUM, res);
4104+      }
4105+      for (; c <= col - C4NUM; c += C4NUM) {
4106+        MS_FLOAT16X4 src_data = MS_CVT_F16_F32(MS_LDQ_F32(src + r * col + c));
4107+        MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data);
4108+      }
4109+      for (; c < col; ++c) {
4110+        dst[c / C4NUM * C4NUM * row + r * C4NUM + c % C4NUM] = src[r * col + c];
4111+      }
4112+    }
4113+    return;
4114+  }
4115+  const float16_t *src = (const float16_t *)src_ptr;
4116+  for (int r = 0; r < row; r++) {
4117+    int c = 0;
4118+    for (; c <= col - C16NUM; c += C16NUM) {
4119+      MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c);
4120+      MS_FLOAT16X8 src_data1 = MS_LDQ_F16(src + r * col + c + C8NUM);
4121+      MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM, src_data);
4122+      MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM + C8NUM, src_data1);
4123+    }
4124+    for (; c <= col - C8NUM; c += C8NUM) {
4125+      MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c);
4126+      MS_STQ_F16(dst + c / C8NUM * C8NUM * row + r * C8NUM, src_data);
4127+    }
4128+    for (; c <= col - C4NUM; c += C4NUM) {
4129+      MS_FLOAT16X4 src_data = MS_LD_F16(src + r * col + c);
4130+      MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data);
4131+    }
4132+    for (; c < col; ++c) {
4133+      dst[c / C4NUM * C4NUM * row + r * C4NUM + c % C4NUM] = src[r * col + c];
4134+    }
4135+  }
4136+}
4137 #endif
4138
4139 void RowMajor2Col12MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
4140@@ -802,32 +991,6 @@ void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col,
4141   }
4142 }
4143
4144-#ifdef ENABLE_ARM64
4145-void RowMajor2RowNMajorFp16(const float16_t *src, float16_t *dst, int row, int col) {
4146-  // Row16 ==> Row8 ==> Row4
4147-  for (int r = 0; r < row; r++) {
4148-    int c = 0;
4149-    for (; c <= col - C16NUM; c += C16NUM) {
4150-      MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c);
4151-      MS_FLOAT16X8 src_data1 = MS_LDQ_F16(src + r * col + c + C8NUM);
4152-      MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM, src_data);
4153-      MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM + C8NUM, src_data1);
4154-    }
4155-    for (; c <= col - C8NUM; c += C8NUM) {
4156-      MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c);
4157-      MS_STQ_F16(dst + c / C8NUM * C8NUM * row + r * C8NUM, src_data);
4158-    }
4159-    for (; c <= col - C4NUM; c += C4NUM) {
4160-      MS_FLOAT16X4 src_data = MS_LD_F16(src + r * col + c);
4161-      MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data);
4162-    }
4163-    for (; c < col; ++c) {
4164-      dst[c / C4NUM * C4NUM * row + r * C4NUM + c % C4NUM] = src[r * col + c];
4165-    }
4166-  }
4167-}
4168-#endif
4169-
4170 void RowMajor2Row16MajorFp16Opt(const float16_t *src, float16_t *dst, int row, int col) {
4171   int col_align = UP_ROUND(col, C16NUM);
4172   for (int r = 0; r < row; r++) {
4173diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h
4174index be7f8443..7acef622 100644
4175--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h
4176+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h
4177@@ -14,8 +14,8 @@
4178  * limitations under the License.
4179  */
4180
4181-#ifndef NNACL_FP16_MATMUL_FP16_H_
4182-#define NNACL_FP16_MATMUL_FP16_H_
4183+#ifndef MINDSPORE_NNACL_FP16_MATMUL_H_
4184+#define MINDSPORE_NNACL_FP16_MATMUL_H_
4185
4186 #include <float.h>
4187 #include <string.h>
4188@@ -45,9 +45,13 @@ void MatMul12x8Fp16(const float16_t *a, const float16_t *b, float16_t *dst, cons
4189                     int deep, int row, int col, int stride, int write_mode);
4190
4191 #ifdef ENABLE_ARM64
4192-void RowMajor2ColNMajorFp16(const float16_t *src, float16_t *dst_ptr, int row, int col);
4193+void RowMajor2ColLadder12MajorFp16(const float16_t *src, float16_t *dst_ptr, int row, int col);
4194
4195-void RowMajor2RowNMajorFp16(const float16_t *src, float16_t *dst, int row, int col);
4196+void RowMajor2RowLadder12MajorFp16(const float16_t *src, float16_t *dst, int row, int col);
4197+
4198+void RowMajor2ColNMajorFp16(const void *src, float16_t *dst_ptr, int row, int col, bool is_fp32_src);
4199+
4200+void RowMajor2RowNMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);
4201
4202 void MatMul12x16Fp16Opt(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type,
4203                         int deep, int row, int col, size_t stride, size_t out_type);
4204@@ -60,6 +64,9 @@ void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, c
4205 void MatmulBaseFp16Neon(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
4206                         size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
4207
4208+void MatmulFp16OptV2(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
4209+                     size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
4210+
4211 #ifdef ENABLE_DEBUG
4212 void MatmulBaseFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
4213                     size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
4214@@ -118,4 +125,4 @@ void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bo
4215 }
4216 #endif
4217
4218-#endif  //  NNACL_FP16_MATMUL_FP16_H_
4219+#endif  // MINDSPORE_NNACL_FP16_MATMUL_H_
4220diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c
4221index 74e75115..da9f6bef 100644
4222--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c
4223+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c
4224@@ -33,7 +33,7 @@ static void PackLstmMatrix(const float *src_batch, float *dst_batch, int col, in
4225 }
4226
4227 static void PackLstmWeightBatch(float *dst, const float *src, int batch, int deep, int col, int col_align,
4228-                                const int32_t *order) {
4229+                                const int *order) {
4230   for (int i = 0; i < batch; i++) {
4231     const float *src_batch = src + i * col * deep;
4232     float *dst_batch = dst + ((order == NULL) ? i : order[i]) * col_align * deep;
4233@@ -41,12 +41,12 @@ static void PackLstmWeightBatch(float *dst, const float *src, int batch, int dee
4234   }
4235 }
4236
4237-void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int32_t *order) {
4238+void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int *order) {
4239   PackLstmWeightBatch(dst, src, batch, deep, col, col_align, order);
4240 }
4241
4242 void PackLstmWeightWithStride(float *dst, const float *src, int batch, int deep, int col, int col_align,
4243-                              bool is_bidirectional, int stride, const int32_t *order) {
4244+                              bool is_bidirectional, int stride, const int *order) {
4245   int unidirectional_batch = is_bidirectional ? batch / 2 : batch;
4246   PackLstmWeightBatch(dst, src, unidirectional_batch, deep, col, col_align, order);
4247   src += stride;
4248@@ -57,7 +57,7 @@ void PackLstmWeightWithStride(float *dst, const float *src, int batch, int deep,
4249 }
4250
4251 void PackLstmBias(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional,
4252-                  const int32_t *order) {
4253+                  const int *order) {
4254   int unidirectional_batch = is_bidirectional ? batch / 2 : batch;
4255   for (int i = 0; i < unidirectional_batch; i++) {
4256     const float *src_batch = src + i * col;
4257@@ -76,7 +76,7 @@ void PackLstmBias(float *dst, const float *src, int batch, int col, int col_alig
4258 }
4259
4260 void PackLstmBiasWithStride(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional,
4261-                            int b_stride, const int32_t *order) {
4262+                            int b_stride, const int *order) {
4263   int unidirectional_batch = is_bidirectional ? batch / 2 : batch;
4264   for (int i = 0; i < unidirectional_batch; i++) {
4265     const float *src_batch = src + i * col;
4266@@ -175,13 +175,13 @@ void UpdateOutput(float *hidden_state, float *output, const float *cell_state, c
4267                   const float *weight_project, float *buffer[C8NUM], const LstmParameter *lstm_param) {
4268   int batch = lstm_param->batch_;
4269   int hidden_size = lstm_param->hidden_size_;
4270-  int project_size = lstm_param->project_size_;
4271+  int output_size = lstm_param->output_size_;
4272   float *state_buffer = buffer[C4NUM];
4273   float *hidden_buffer = weight_project ? buffer[C2NUM] : hidden_state;
4274   float zoneout = lstm_param->zoneout_hidden_;
4275   if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) {
4276-    (void)memcpy(state_buffer, hidden_state, batch * project_size * sizeof(float));
4277-    ElementOptMul(state_buffer, &zoneout, state_buffer, batch * project_size, false);
4278+    (void)memcpy(state_buffer, hidden_state, batch * hidden_size * sizeof(float));
4279+    ElementOptMul(state_buffer, &zoneout, state_buffer, batch * hidden_size, false);
4280   }
4281
4282   Tanh(cell_state, batch * hidden_size, hidden_buffer);
4283@@ -193,20 +193,13 @@ void UpdateOutput(float *hidden_state, float *output, const float *cell_state, c
4284       left_matrix = buffer[C6NUM];
4285       PackLstmInput(hidden_buffer, left_matrix, batch, hidden_size);
4286     }
4287-#ifdef ENABLE_AVX
4288-    int col_tile = batch == 1 ? C8NUM : C16NUM;
4289-#elif defined(ENABLE_ARM32)
4290-    int col_tile = C4NUM;
4291-#else
4292-    int col_tile = C8NUM;
4293-#endif
4294-    LstmMatMul(hidden_state, left_matrix, weight_project, NULL, batch, hidden_size, project_size,
4295-               UP_ROUND(project_size, col_tile), batch == 1, buffer[C7NUM]);
4296+    LstmMatMul(hidden_state, left_matrix, weight_project, NULL, batch, hidden_size, output_size,
4297+               lstm_param->proj_col_align_, batch == 1, buffer[C7NUM]);
4298   }
4299   if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) {
4300-    ElementOptMulAcc(hidden_state, 1 - zoneout, state_buffer, batch * project_size);
4301+    ElementOptMulAcc(hidden_state, 1 - zoneout, state_buffer, batch * output_size);
4302   }
4303-  (void)memcpy(output, hidden_state, batch * project_size * sizeof(float));
4304+  (void)memcpy(output, hidden_state, batch * output_size * sizeof(float));
4305 }
4306
4307 void UpdateLstmGate(float *gate_buffer, const float *input, const float *weight, const float *bias, int row, int deep,
4308@@ -238,12 +231,12 @@ void LstmStepUnit(float *output, float *input_gate, float *forget_gate, float *c
4309   bool is_vec = lstm_param->batch_ == 1;
4310   // state * weight
4311   if (is_vec) {
4312-    UpdateLstmGate(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->project_size_,
4313+    UpdateLstmGate(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_,
4314                    lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec, packed_output);
4315   } else {
4316     // pack state for matmul
4317-    PackLstmInput(hidden_state, packed_state, lstm_param->batch_, lstm_param->project_size_);
4318-    UpdateLstmGate(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->project_size_,
4319+    PackLstmInput(hidden_state, packed_state, lstm_param->batch_, lstm_param->output_size_);
4320+    UpdateLstmGate(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_,
4321                    lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec, packed_output);
4322   }
4323   ElementAdd(input_gate, state_gate, input_gate, lstm_param->batch_ * lstm_param->hidden_size_);
4324@@ -276,7 +269,7 @@ void LstmStepUnit(float *output, float *input_gate, float *forget_gate, float *c
4325   }
4326
4327   if (!(lstm_param->zoneout_hidden_ >= -FLT_EPSILON && lstm_param->zoneout_hidden_ <= FLT_EPSILON)) {
4328-    (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->project_size_ * sizeof(float));
4329+    (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->output_size_ * sizeof(float));
4330   }
4331 }
4332
4333@@ -322,12 +315,12 @@ void Lstm(float *output, const float *input, const float *weight_i, const float
4334   // backward
4335   if (lstm_param->bidirectional_) {
4336     const float *backward_weight_i = weight_i + 4 * lstm_param->input_col_align_ * lstm_param->input_size_;
4337-    const float *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->hidden_size_;
4338+    const float *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->output_size_;
4339     const float *backward_input_bias = input_bias + 4 * lstm_param->input_col_align_;
4340     const float *backward_state_bias = state_bias + 4 * lstm_param->state_col_align_;
4341-    float *backward_output = output + lstm_param->batch_ * lstm_param->hidden_size_;
4342+    float *backward_output = output + lstm_param->batch_ * lstm_param->output_size_;
4343     float *backward_cell_state = cell_state + lstm_param->batch_ * lstm_param->hidden_size_;
4344-    float *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->hidden_size_;
4345+    float *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->output_size_;
4346
4347     LstmUnidirectional(backward_output, packed_input, backward_weight_i, backward_weight_h, backward_input_bias,
4348                        backward_state_bias, backward_hidden_state, backward_cell_state, buffer, lstm_param, true);
4349diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h
4350index 88dd9d16..f94f0bb7 100644
4351--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h
4352+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h
4353@@ -21,16 +21,16 @@
4354 #ifdef __cplusplus
4355 extern "C" {
4356 #endif
4357-void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int32_t *order);
4358+void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int *order);
4359
4360 void PackLstmWeightWithStride(float *dst, const float *src, int batch, int deep, int col, int col_align,
4361-                              bool is_bidirectional, int stride, const int32_t *order);
4362+                              bool is_bidirectional, int stride, const int *order);
4363
4364 void PackLstmBias(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional,
4365-                  const int32_t *order);
4366+                  const int *order);
4367
4368 void PackLstmBiasWithStride(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional,
4369-                            int b_stride, const int32_t *order);
4370+                            int b_stride, const int *order);
4371
4372 void PackLstmInput(const float *src, float *dst, int row, int deep);
4373
4374diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c
4375index 308419fb..1898ffd4 100644
4376--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c
4377+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c
4378@@ -440,8 +440,8 @@ void MatVecMulNoPackFp32(const float *a, const float *b, float *c, const float *
4379       }
4380       c[oc_index] = dst;
4381     }
4382-    a += k;
4383-    b += k * col;
4384+    a += C1500NUM;
4385+    b += C1500NUM * col;
4386   }
4387   if (k == depth) {
4388     return;
4389diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.c
4390new file mode 100644
4391index 00000000..ad1cac2e
4392--- /dev/null
4393+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.c
4394@@ -0,0 +1,36 @@
4395+/**
4396+ * Copyright 2023 Huawei Technologies Co., Ltd
4397+ *
4398+ * Licensed under the Apache License, Version 2.0 (the "License");
4399+ * you may not use this file except in compliance with the License.
4400+ * You may obtain a copy of the License at
4401+ *
4402+ * http://www.apache.org/licenses/LICENSE-2.0
4403+ *
4404+ * Unless required by applicable law or agreed to in writing, software
4405+ * distributed under the License is distributed on an "AS IS" BASIS,
4406+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4407+ * See the License for the specific language governing permissions and
4408+ * limitations under the License.
4409+ */
4410+
4411+#include "nnacl/infer/custom_gather_d_grad_v2_infer.h"
4412+#include "nnacl/infer/infer_register.h"
4413+
4414+int CustomGatherDGradV2InferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
4415+                                  size_t outputs_size, OpParameter *parameter) {
4416+  int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, C3NUM, C1NUM);
4417+  if (check_ret != NNACL_OK) {
4418+    return check_ret;
4419+  }
4420+  const TensorC *input = inputs[0];
4421+  TensorC *output = outputs[0];
4422+  SetDataTypeFormat(output, input);
4423+  if (!InferFlag(inputs, inputs_size)) {
4424+    return NNACL_INFER_INVALID;
4425+  }
4426+  SetShapeTensor(output, input);
4427+  return NNACL_OK;
4428+}
4429+
4430+REG_INFER(CustomGatherDGradV2, PrimType_Inner_CustomGatherDGradV2, CustomGatherDGradV2InferShape)
4431diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.h
4432new file mode 100644
4433index 00000000..68d85d20
4434--- /dev/null
4435+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.h
4436@@ -0,0 +1,30 @@
4437+/**
4438+ * Copyright 2023 Huawei Technologies Co., Ltd
4439+ *
4440+ * Licensed under the Apache License, Version 2.0 (the "License");
4441+ * you may not use this file except in compliance with the License.
4442+ * You may obtain a copy of the License at
4443+ *
4444+ * http://www.apache.org/licenses/LICENSE-2.0
4445+ *
4446+ * Unless required by applicable law or agreed to in writing, software
4447+ * distributed under the License is distributed on an "AS IS" BASIS,
4448+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4449+ * See the License for the specific language governing permissions and
4450+ * limitations under the License.
4451+ */
4452+#ifndef MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_INFER_H
4453+#define MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_INFER_H
4454+#include "nnacl/infer/common_infer.h"
4455+
4456+#ifdef __cplusplus
4457+extern "C" {
4458+#endif
4459+
4460+int CustomGatherDGradV2InferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs,
4461+                                  size_t outputs_size, OpParameter *parameter);
4462+
4463+#ifdef __cplusplus
4464+}
4465+#endif
4466+#endif  // MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_INFER_H
4467diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c
4468index 9892ef0b..391e2522 100644
4469--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c
4470+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c
4471@@ -17,41 +17,81 @@
4472 #include "nnacl/infer/lstm_infer.h"
4473 #include "nnacl/infer/infer_register.h"
4474
4475-static const int num_of_gates = 4;
4476-static const int no_of_recorde_values = 6;
4477+static const int no_of_recorde_values = 5;
4478
4479 int CheckInputShapeValid(const TensorC *const *inputs, size_t inputs_size, const LstmParameter *parameter) {
4480+  if (inputs_size < C6NUM) {
4481+    return NNACL_INPUT_TENSOR_ERROR;
4482+  }
4483   const TensorC *input = inputs[FIRST_INPUT];
4484   const TensorC *weight_i = inputs[SECOND_INPUT];
4485   const TensorC *weight_g = inputs[THIRD_INPUT];
4486   const TensorC *bias = inputs[FOURTH_INPUT];
4487-  const TensorC *cell = inputs[FIFTH_INPUT];
4488+  const TensorC *hidden_init = inputs[FIFTH_INPUT];
4489+  const TensorC *cell_init = inputs[SIXTH_INPUT];
4490+
4491+  NNACL_CHECK_TRUE_RET(input->shape_size_ == DIMENSION_3D && weight_i->shape_size_ == DIMENSION_3D &&
4492+                         weight_g->shape_size_ == DIMENSION_3D && bias->shape_size_ == DIMENSION_2D,
4493+                       NNACL_ERR);
4494   int batch = input->shape_[kNHWC_H];
4495   int input_size = input->shape_[kNHWC_W];
4496   int hidden_size = weight_i->shape_[kNHWC_H] / C4NUM;
4497-  int project_size = inputs_size == C7NUM ? inputs[C6NUM]->shape_[kNHWC_H] : hidden_size;
4498-  bool bidirectional = parameter->bidirectional_;
4499-  if (input->shape_size_ != DIMENSION_3D || weight_i->shape_size_ != DIMENSION_3D) {
4500-    return NNACL_ERR;
4501+  int out_size = hidden_size;
4502+  if (inputs_size == C7NUM) {
4503+    NNACL_CHECK_TRUE_RET(inputs[SEVENTH_INPUT]->shape_size_ == DIMENSION_3D, NNACL_INPUT_TENSOR_ERROR);
4504+    out_size = inputs[SEVENTH_INPUT]->shape_[kNHWC_H];
4505   }
4506+  bool bidirectional = parameter->bidirectional_;
4507   int bidirection = bidirectional ? C2NUM : C1NUM;
4508   NNACL_CHECK_TRUE_RET(weight_i->shape_[kNHWC_N] == bidirection && weight_i->shape_[kNHWC_H] == hidden_size * C4NUM &&
4509                          weight_i->shape_[kNHWC_W] == input_size,
4510                        NNACL_ERR);
4511   NNACL_CHECK_TRUE_RET(weight_g->shape_[kNHWC_N] == bidirection && weight_g->shape_[kNHWC_H] == hidden_size * C4NUM &&
4512-                         weight_g->shape_[kNHWC_W] == project_size,
4513+                         weight_g->shape_[kNHWC_W] == out_size,
4514                        NNACL_ERR);
4515   NNACL_CHECK_TRUE_RET(bias->shape_[kNHWC_N] == bidirection && bias->shape_[kNHWC_H] == hidden_size * C8NUM, NNACL_ERR);
4516-  if (!bidirectional && cell->shape_size_ == DIMENSION_2D) {
4517-    NNACL_CHECK_TRUE_RET(cell->shape_[kNHWC_N] == batch && cell->shape_[kNHWC_H] == hidden_size, NNACL_ERR);
4518+  if (!bidirectional && hidden_init->shape_size_ == DIMENSION_2D) {
4519+    NNACL_CHECK_TRUE_RET(hidden_init->shape_[kNHWC_N] == batch && hidden_init->shape_[kNHWC_H] == out_size, NNACL_ERR);
4520   } else {
4521-    NNACL_CHECK_TRUE_RET(
4522-      cell->shape_[kNHWC_N] == bidirection && cell->shape_[kNHWC_H] == batch && cell->shape_[kNHWC_W] == project_size,
4523-      NNACL_ERR);
4524+    NNACL_CHECK_TRUE_RET(hidden_init->shape_size_ == DIMENSION_3D && hidden_init->shape_[kNHWC_N] == bidirection &&
4525+                           hidden_init->shape_[kNHWC_H] == batch && hidden_init->shape_[kNHWC_W] == out_size,
4526+                         NNACL_ERR);
4527+  }
4528+  if (!bidirectional && cell_init->shape_size_ == DIMENSION_2D) {
4529+    NNACL_CHECK_TRUE_RET(cell_init->shape_[kNHWC_N] == batch && cell_init->shape_[kNHWC_H] == hidden_size, NNACL_ERR);
4530+  } else {
4531+    NNACL_CHECK_TRUE_RET(cell_init->shape_size_ == DIMENSION_3D && cell_init->shape_[kNHWC_N] == bidirection &&
4532+                           cell_init->shape_[kNHWC_H] == batch && cell_init->shape_[kNHWC_W] == hidden_size,
4533+                         NNACL_ERR);
4534   }
4535   return NNACL_OK;
4536 }
4537
4538+int InferFirstOutputMindir(const TensorC *const *inputs, size_t inputs_size, TensorC *output, LstmParameter *param) {
4539+  for (size_t i = 0; i < inputs_size; ++i) {
4540+    if (inputs[i]->shape_size_ != C3NUM) {
4541+      return NNACL_INPUT_TENSOR_ERROR;
4542+    }
4543+  }
4544+  ShapeSet(output->shape_, &output->shape_size_, inputs[0]->shape_, inputs[0]->shape_size_);
4545+  int out_size = inputs[SECOND_INPUT]->shape_[THIRD_INPUT];
4546+  output->shape_[THIRD_INPUT] = (param->bidirectional_ ? C2NUM : 1) * out_size;
4547+  return NNACL_OK;
4548+}
4549+
4550+int InferFirstOutputNonMindir(const TensorC *const *inputs, size_t inputs_size, TensorC *output, LstmParameter *param) {
4551+  if (CheckInputShapeValid(inputs, inputs_size, param) != NNACL_OK) {
4552+    return NNACL_ERR;
4553+  }
4554+  ShapeSet(output->shape_, &output->shape_size_, inputs[0]->shape_, inputs[0]->shape_size_);
4555+  const TensorC *hidden_init = inputs[FIFTH_INPUT];
4556+  int out_size = hidden_init->shape_[hidden_init->shape_size_ - 1];
4557+  output->shape_[THIRD_INPUT] = out_size;
4558+  int direction = param->bidirectional_ ? C2NUM : C1NUM;
4559+  int ret = ShapeInsert(output->shape_, &output->shape_size_, 1, direction);
4560+  return ret;
4561+}
4562+
4563 int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size,
4564                    OpParameter *parameter) {
4565   int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 4, 3);
4566@@ -60,9 +100,8 @@ int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
4567   }
4568
4569   const TensorC *input = inputs[0];
4570-  const TensorC *weight_i = inputs[1];
4571   TensorC *output = outputs[0];
4572-  for (int i = 0; i < 3; i++) {
4573+  for (int i = 0; i < outputs_size; i++) {
4574     SetDataTypeFormat(outputs[i], input);
4575   }
4576
4577@@ -71,42 +110,31 @@ int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
4578   if (!InferFlag(inputs, inputs_size)) {
4579     return NNACL_INFER_INVALID;
4580   }
4581-  int dir_multiplier = param->bidirectional_ ? 2 : 1;
4582-  int out_shape[MAX_SHAPE_SIZE];
4583-  size_t out_shape_size = 0;
4584-  int hidden_size = 1;
4585-  int project_size = 1;
4586-  ShapeSet(out_shape, &out_shape_size, input->shape_, input->shape_size_);
4587-  if (inputs_size == DIMENSION_4D) {  // if input from MINDIR
4588-    hidden_size = weight_i->shape_[THIRD_INPUT];
4589-    project_size = hidden_size;
4590-    out_shape[THIRD_INPUT] = hidden_size * dir_multiplier;
4591-  } else {
4592-    if (CheckInputShapeValid(inputs, inputs_size, param) != NNACL_OK) {
4593-      return NNACL_ERR;
4594+  int hidden_size = 0;
4595+  int out_size = 0;
4596+  if (inputs_size == C4NUM) {
4597+    int ret = InferFirstOutputMindir(inputs, inputs_size, output, param);
4598+    if (ret != NNACL_OK) {
4599+      return ret;
4600     }
4601-    hidden_size = weight_i->shape_[1] / num_of_gates;
4602-    project_size = inputs_size == C7NUM ? inputs[C6NUM]->shape_[kNHWC_H] : hidden_size;
4603-    out_shape[THIRD_INPUT] = project_size;
4604-    if (param->bidirectional_) {
4605-      int ret = ShapeInsert(out_shape, &out_shape_size, 1, 2);
4606-      if (ret != NNACL_OK) {
4607-        return NNACL_ERR;
4608-      }
4609-    } else {
4610-      int ret = ShapeInsert(out_shape, &out_shape_size, 1, 1);
4611-      if (ret != NNACL_OK) {
4612-        return NNACL_ERR;
4613-      }
4614+    hidden_size = inputs[THIRD_INPUT]->shape_[THIRD_INPUT];
4615+    out_size = inputs[SECOND_INPUT]->shape_[THIRD_INPUT];
4616+  } else {
4617+    int ret = InferFirstOutputNonMindir(inputs, inputs_size, output, param);
4618+    if (ret != NNACL_OK) {
4619+      return ret;
4620     }
4621+    hidden_size = inputs[SIXTH_INPUT]->shape_[inputs[SIXTH_INPUT]->shape_size_ - 1];
4622+    out_size = inputs[FIFTH_INPUT]->shape_[inputs[FIFTH_INPUT]->shape_size_ - 1];
4623   }
4624-  SetShapeArray(output, out_shape, out_shape_size);
4625+
4626+  int dir_multiplier = param->bidirectional_ ? C2NUM : C1NUM;
4627   int state_shape[MAX_SHAPE_SIZE];
4628   size_t state_shape_size = 0;
4629
4630   ShapeSet(state_shape, &state_shape_size, input->shape_, input->shape_size_);
4631   state_shape[FIRST_INPUT] = dir_multiplier;
4632-  state_shape[THIRD_INPUT] = project_size;
4633+  state_shape[THIRD_INPUT] = out_size;
4634   SetShapeArray(outputs[SECOND_INPUT], state_shape, state_shape_size);
4635   state_shape[THIRD_INPUT] = hidden_size;
4636   SetShapeArray(outputs[THIRD_INPUT], state_shape, state_shape_size);
4637@@ -116,11 +144,9 @@ int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o
4638     const size_t intermediate_states_shape_size = 1;
4639     int batch_size = input->shape_[SECOND_INPUT];
4640     int seq_len = input->shape_[FIRST_INPUT];
4641-    intermediate_states_shape[FIRST_INPUT] = no_of_recorde_values * batch_size * hidden_size * seq_len * dir_multiplier;
4642-    SetDataTypeFormat(outputs[FOURTH_INPUT], inputs[FIRST_INPUT]);
4643+    intermediate_states_shape[FIRST_INPUT] =
4644+      batch_size * seq_len * dir_multiplier * (out_size + no_of_recorde_values * hidden_size);
4645     SetShapeArray(outputs[FOURTH_INPUT], intermediate_states_shape, intermediate_states_shape_size);
4646-
4647-    SetDataTypeFormat(outputs[FIFTH_INPUT], inputs[FIRST_INPUT]);
4648     SetShapeArray(outputs[FIFTH_INPUT], state_shape, state_shape_size);
4649   }
4650
4651diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c
4652index 287e9de3..3c192df7 100644
4653--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c
4654+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c
4655@@ -33,12 +33,14 @@ int CalShape(const int *data, const TensorC *const *inputs, int *out_shape, size
4656     }
4657     ShapePush(out_shape, out_shape_size, data[i]);
4658   }
4659-
4660+  if (size == 0) {
4661+    return NNACL_ERR;
4662+  }
4663   if ((int)(data[index]) == -1) {
4664     if (index >= MAX_SHAPE_SIZE) {
4665       return NNACL_ERR;
4666     }
4667-    out_shape[index] = size == 0 ? 0 : input_count / size;
4668+    out_shape[index] = input_count / size;
4669   }
4670   return NNACL_OK;
4671 }
4672diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h
4673index 377993cd..6a933785 100644
4674--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h
4675+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h
4676@@ -308,7 +308,7 @@ static inline float simd_exp32_f32(float data) {
4677 #else
4678   data = MS_MAX32_F32(-88.0f, MS_MIN32_F32(88.0f, data));  // clamp(-88, 88)
4679 #endif
4680-  int integer = floor(data * 1.44269504088896341f + 0.5f);
4681+  int integer = data / param[0];
4682   float decimal = data - integer * param[0];
4683   fi int_exp;
4684   int_exp.i = (integer + 127) << 23;  // Approximate calculation : (integer + 127) << 23
4685@@ -324,14 +324,19 @@ static inline void simd_exp32(float src, float *dst) {
4686     int i;
4687   } fi;
4688   static float param[] = {0.693147f, 1.0f / 120, 1.0f / 24, 1.0f / 6, 1.0f / 2, 1.0f};  // log(2.0f)
4689-  src = MS_MAX32_F32(-88.0f, MS_MIN32_F32(88.0f, src));                                 // clamp(-88.0f, 88.0f)
4690+  src = MS_MAX32_F32(-87.3365478515625f, MS_MIN32_F32(88.72283935546875f, src));  // clamp(logf(FLT_MIN), logf(FLT_MAX))
4691   int integer = floor(src * 1.44269504088896341f + 0.5f);
4692   float decimal = src - integer * param[0];
4693   fi int_exp;
4694-  int_exp.i = (integer + 127) << 23;  // integer num approximate calculation : (x + 127) << 23
4695+  const int shift = 23;
4696+  const int bias = 126;
4697+  const float factor = 2;
+  // Compute 2^n * exp(r) as 2 * 2^(n - 1) * exp(r),
+  // because n may reach 128, and 2^128 is not representable in fp32.
+  int_exp.i = (integer + bias) << shift;  // integer part 2^(n - 1), approximated as ((n - 1) + 127) << 23
4701   const float decimal_exp =
4702     1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1]))));
4703-  *dst = int_exp.f * decimal_exp;
4704+  *dst = factor * int_exp.f * decimal_exp;
4705 }
4706
4707 // define (float/int) data
4708diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h
4709index a29c4dbb..94ed4b89 100644
4710--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h
4711+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h
4712@@ -94,9 +94,13 @@ static inline float16x4_t ms_vcvt_f16_f32(float32x4_t in) {
4713
4714 #define MS_FLOAT16X8 float16x8_t
4715 #define MS_FLOAT16X4 float16x4_t
4716+#define MS_FLOAT16X4X4 float16x4x4_t
4717+#define MS_FLOAT16X4X2 float16x4x2_t
4718 #define MS_MOVQ_F16 vmovq_n_f16
4719 #define MS_STQ_F16(ptr, val) vst1q_f16(ptr, val)
4720 #define MS_ST_F16 vst1_f16
4721+#define MS_ST2_F16 vst2_f16
4722+#define MS_ST4_F16 vst4_f16
4723 #define MS_MINQ_F16 vminq_f16
4724 #define MS_MAXQ_F16 vmaxq_f16
4725 #define MS_LDQ_F16(ptr) vld1q_f16(ptr)
4726diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h
4727index c4bc34d9..fb38b452 100644
4728--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h
4729+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h
4730@@ -25,6 +25,8 @@
4731 #define MS128_F32_GETI(src, i) src[i]
4732 #define MS_FLOAT32X4 float32x4_t
4733 #define MS_FLOAT128_F32 float32x4_t
4734+#define MS_FLOAT32X4X2 float32x4x2_t
4735+#define MS_FLOAT32X4X4 float32x4x4_t
4736 #define MS_INT32X4 int32x4_t
4737 #define MS_INT128_EPI32 int32x4_t
4738 #define MS_UINT32X4 uint32x4_t
4739@@ -222,29 +224,30 @@ static inline MS_FLOAT32X4 VexpFp32(MS_FLOAT32X4 input) {
4740     {1.0f / 6, 1.0f / 6, 1.0f / 6, 1.0f / 6},
4741     {0.5f, 0.5f, 0.5f, 0.5f},
4742     {1.0f, 1.0f, 1.0f, 1.0f},
4743-    {1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f}};
4744+    {1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f},
4745+    {2.0f, 2.0f, 2.0f, 2.0f}};
4746   static MS_FLOAT32X4 negative_flag = {-0.0f, -0.0f, -0.0f, -0.0f};
4747
4748   MS_INT32X4 integer =
4749     MS_CVTQPS_EPI32(MS_FMADD128_F32(input, param[6], MS_OR128_F32(MS_AND128_F32(input, negative_flag), param[4])));
4750   MS_FLOAT32X4 decimal = MS_SUBQ_F32(input, MS_MULQ_F32(MS_CVTQEPI32_PS(integer), param[0]));
4751-  MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(127)), 23);
4752+  MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(126)), 23);
4753   MS_FLOAT32X4 tmp = MS_MULQ_F32(decimal, (MS_ADDQ_F32(param[2], MS_MULQ_F32(decimal, param[1]))));
4754   tmp = MS_MULQ_F32(decimal, MS_ADDQ_F32(param[4], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[3], tmp))));
4755   MS_FLOAT32X4 decimal_exp = MS_ADDQ_F32(param[5], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[5], tmp)));
4756-  return MS_MULQ_F32(decimal_exp, MS_CAST128_F32_S32(int_exp));
4757+  return MS_MULQ_F32(param[7], MS_MULQ_F32(decimal_exp, MS_CAST128_F32_S32(int_exp)));
4758 }
4759
4760 static inline void simd_exp128(MS_FLOAT32X4 input, float *dst) {
4761-  static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f};
4762-  static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f};
4763+  static MS_FLOAT32X4 maxv = {88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f};
4764+  static MS_FLOAT32X4 minv = {-87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f};
4765   input = MS_MAXQ_F32(minv, MS_MINQ_F32(input, maxv));
4766   MS_STQ_F32(dst, VexpFp32(input));
4767 }
4768
4769 static inline MS_FLOAT32X4 simd_exp128_f32(MS_FLOAT32X4 input) {
4770-  static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f};
4771-  static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f};
4772+  static MS_FLOAT32X4 maxv = {88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f};
4773+  static MS_FLOAT32X4 minv = {-87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f};
4774   input = MS_MAXQ_F32(minv, MS_MINQ_F32(input, maxv));
4775   return VexpFp32(input);
4776 }
4777@@ -286,18 +289,6 @@ static inline MS_FLOAT32X4 MS_TANHX4_F32(MS_FLOAT32X4 src) {
4778   return res;
4779 }
4780
4781-static inline MS_FLOAT128_F32 SIMD_SIGN128_F32(MS_FLOAT128_F32 src) {
4782-  MS_FLOAT128_F32 abs_src = MS_ABS128_F32(src);
4783-  MS_FLOAT128_F32 src_tmp = MS_OR128_F32(src, MS_MOV128_F32(1.0f));
4784-  MS_FLOAT128_F32 sign = MS_DIV128_F32(abs_src, src_tmp);
4785-  return sign;
4786-}
4787-
4788-static inline MS_FLOAT128_F32 SIMD_SIGNABS128_F32(MS_FLOAT128_F32 src, MS_FLOAT128_F32 abs_src) {
4789-  MS_FLOAT128_F32 src_tmp = MS_OR128_F32(src, MS_MOV128_F32(1.0f));
4790-  return MS_DIV128_F32(abs_src, src_tmp);
4791-}
4792-
4793 #define MS_TANH128_F32 MS_TANHX4_F32
4794
4795 static inline MS_FLOAT32X4 MS128_ERF_F32(MS_FLOAT32X4 src) {
4796diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h
4797index 9ecd8409..5baf10fa 100644
4798--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h
4799+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h
4800@@ -25,6 +25,7 @@ typedef struct LstmParameter {
4801   int input_size_;
4802   int hidden_size_;
4803   int project_size_;
4804+  int output_size_;
4805   int seq_len_;
4806   int batch_;
4807   // other parameter
4808@@ -36,6 +37,8 @@ typedef struct LstmParameter {
4809   int input_col_align_;
4810   int state_row_align_;
4811   int state_col_align_;
4812+  int proj_col_align_;
4813+  bool has_bias_;
4814 } LstmParameter;
4815
4816 #endif  // NNACL_LSTM_PARAMETER_H_
4817diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
4818index 895f7e3d..bd0d152c 100644
4819--- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
4820+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h
4821@@ -562,6 +562,7 @@ enum PrimType {
4822   PrimType_Inner_CustomMaskedFill = 10014,
4823   PrimType_Inner_CustomTensorScatterMax = 10015,
4824   PrimType_Inner_CustomIsInf = 10016,
4825+  PrimType_Inner_CustomGatherDGradV2 = 10017,
4826   PrimType_InnerOpMax,
4827   PrimType_InnerOpMin = PrimType_Inner_ToFormat
4828 };
4829diff --git a/mindspore/core/mindrt/src/thread/threadpool.cc b/mindspore/core/mindrt/src/thread/threadpool.cc
4830index 2301be8c..342ffb7f 100644
4831--- a/mindspore/core/mindrt/src/thread/threadpool.cc
4832+++ b/mindspore/core/mindrt/src/thread/threadpool.cc
4833@@ -53,7 +53,7 @@ Worker::~Worker() {
4834 void Worker::CreateThread() { thread_ = std::make_unique<std::thread>(&Worker::Run, this); }
4835
4836 void Worker::ReinitAfterFork() {
4837-  THREAD_INFO("worker %ld recreate thread after fork in child process", worker_id_);
4838+  THREAD_INFO("worker %zu recreate thread after fork in child process", worker_id_);
4839   if (cond_var_ != nullptr) {
4840     (void)cond_var_.release();
4841     cond_var_ = std::make_unique<std::condition_variable>();
4842diff --git a/mindspore/core/ops/base_operator.h b/mindspore/core/ops/base_operator.h
4843index 811a6000..23652e8e 100644
4844--- a/mindspore/core/ops/base_operator.h
4845+++ b/mindspore/core/ops/base_operator.h
4846@@ -75,7 +75,7 @@ class MIND_API OperatorRegisterHelper {
4847  public:
4848   OperatorRegisterHelper(const std::string &kname, const OperatorDefineFunc &fn) {
4849     OperatorRegister::GetInstance().SetOperatorMap(kname, fn);
4850-    (void)id_;  // make compiler happy on macos
4851+    //    (void)id_;  // make compiler happy on macos
4852   }
4853
4854   ~OperatorRegisterHelper() = default;
4855diff --git a/mindspore/core/ops/grad/gather_d_grad_v2.cc b/mindspore/core/ops/grad/gather_d_grad_v2.cc
4856index 3ce5f887..c999ca88 100644
4857--- a/mindspore/core/ops/grad/gather_d_grad_v2.cc
4858+++ b/mindspore/core/ops/grad/gather_d_grad_v2.cc
4859@@ -75,6 +75,11 @@ TypePtr GatherDGradV2InferType(const PrimitivePtr &prim, const std::vector<Abstr
4860 }
4861 }  // namespace
4862
4863+int64_t GatherDGradV2::get_dim() const {
4864+  auto value_ptr = this->GetAttr(kDim);
4865+  return GetValue<int64_t>(value_ptr);
4866+}
4867+
4868 AbstractBasePtr GatherDGradV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
4869                                    const std::vector<AbstractBasePtr> &input_args) {
4870   auto infer_type = GatherDGradV2InferType(primitive, input_args);
4871diff --git a/mindspore/core/ops/grad/gather_d_grad_v2.h b/mindspore/core/ops/grad/gather_d_grad_v2.h
4872index 94274e3b..40a6e412 100644
4873--- a/mindspore/core/ops/grad/gather_d_grad_v2.h
4874+++ b/mindspore/core/ops/grad/gather_d_grad_v2.h
4875@@ -25,6 +25,7 @@ class MIND_API GatherDGradV2 : public BaseOperator {
4876  public:
4877   MIND_API_BASE_MEMBER(GatherDGradV2);
4878   GatherDGradV2() : BaseOperator(kNameGatherDGradV2) { InitIOName({"x", "dim", "index", "grad"}, {"output"}); }
4879+  int64_t get_dim() const;
4880 };
4881 MIND_API abstract::AbstractBasePtr GatherDGradV2Infer(const abstract::AnalysisEnginePtr &,
4882                                                       const PrimitivePtr &primitive,
4883diff --git a/mindspore/core/ops/grad/lstm_grad.cc b/mindspore/core/ops/grad/lstm_grad.cc
4884index d51c4882..c25e0379 100644
4885--- a/mindspore/core/ops/grad/lstm_grad.cc
4886+++ b/mindspore/core/ops/grad/lstm_grad.cc
4887@@ -98,15 +98,22 @@ void LSTMGrad::set_zoneout_hidden(float zoneout_hidden) {
4888
4889 float LSTMGrad::get_zoneout_hidden() const { return GetValue<float>(this->GetAttr(kZoneoutHidden)); }
4890
4891+void LSTMGrad::set_proj_size(const int64_t proj_size) {
4892+  (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name());
4893+  (void)AddAttr(kProjection_size, api::MakeValue(proj_size));
4894+}
4895+int64_t LSTMGrad::get_proj_size() const { return GetValue<int64_t>(GetAttr(kProjection_size)); }
4896+
4897 void LSTMGrad::Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias,
4898-                    const float dropout, const bool bidirectional, const float zoneout_cell,
4899-                    const float zoneout_hidden) {
4900+                    const float dropout, const bool bidirectional, const float zoneout_cell, const float zoneout_hidden,
4901+                    const int64_t proj_size) {
4902   this->set_input_size(input_size);
4903   this->set_hidden_size(hidden_size);
4904   this->set_num_layers(num_layers);
4905   this->set_has_bias(has_bias);
4906   this->set_dropout(dropout);
4907   this->set_bidirectional(bidirectional);
4908+  this->set_proj_size(proj_size);
4909   if (bidirectional) {
4910     constexpr int k2Directions = 2;
4911     this->set_num_directions(k2Directions);
4912diff --git a/mindspore/core/ops/grad/lstm_grad.h b/mindspore/core/ops/grad/lstm_grad.h
4913index 73272d55..f6eba32c 100644
4914--- a/mindspore/core/ops/grad/lstm_grad.h
4915+++ b/mindspore/core/ops/grad/lstm_grad.h
4916@@ -31,7 +31,7 @@ class MIND_API LSTMGrad : public BaseOperator {
4917   LSTMGrad() : BaseOperator(kNameLSTMGrad) {}
4918   void Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias,
4919             const float dropout, const bool bidirectional = false, const float zoneout_cell = 0.0f,
4920-            const float zoneout_hidden = 0.0f);
4921+            const float zoneout_hidden = 0.0f, const int64_t proj_size = 0);
4922   void set_input_size(const int64_t input_size);
4923   int64_t get_input_size() const;
4924   void set_hidden_size(const int64_t hidden_size);
4925@@ -51,6 +51,8 @@ class MIND_API LSTMGrad : public BaseOperator {
4926   void set_zoneout_hidden(float zoneout_hidden);
4927   float get_zoneout_hidden() const;
4928   int64_t get_good_ld(const int64_t dim, const int64_t type_size);
4929+  void set_proj_size(const int64_t proj_size);
4930+  int64_t get_proj_size() const;
4931 };
4932 }  // namespace ops
4933 }  // namespace mindspore
4934diff --git a/mindspore/core/ops/grad/lstm_grad_data.cc b/mindspore/core/ops/grad/lstm_grad_data.cc
4935index 573d26f4..2b25282c 100644
4936--- a/mindspore/core/ops/grad/lstm_grad_data.cc
4937+++ b/mindspore/core/ops/grad/lstm_grad_data.cc
4938@@ -91,15 +91,23 @@ void LSTMGradData::set_zoneout_hidden(float zoneout_hidden) {
4939
4940 float LSTMGradData::get_zoneout_hidden() const { return GetValue<float>(this->GetAttr(kZoneoutHidden)); }
4941
4942+void LSTMGradData::set_proj_size(const int64_t proj_size) {
4943+  (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name());
4944+  (void)AddAttr(kProjection_size, api::MakeValue(proj_size));
4945+}
4946+
4947+int64_t LSTMGradData::get_proj_size() const { return GetValue<int64_t>(GetAttr(kProjection_size)); }
4948+
4949 void LSTMGradData::Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers,
4950                         const bool has_bias, const float dropout, const bool bidirectional, const float zoneout_cell,
4951-                        const float zoneout_hidden) {
4952+                        const float zoneout_hidden, const int64_t proj_size) {
4953   this->set_input_size(input_size);
4954   this->set_hidden_size(hidden_size);
4955   this->set_num_layers(num_layers);
4956   this->set_has_bias(has_bias);
4957   this->set_dropout(dropout);
4958   this->set_bidirectional(bidirectional);
4959+  this->set_proj_size(proj_size);
4960   if (bidirectional) {
4961     constexpr int k2Directions = 2;
4962     this->set_num_directions(k2Directions);
4963diff --git a/mindspore/core/ops/grad/lstm_grad_data.h b/mindspore/core/ops/grad/lstm_grad_data.h
4964index adcf2ee7..f93e3260 100644
4965--- a/mindspore/core/ops/grad/lstm_grad_data.h
4966+++ b/mindspore/core/ops/grad/lstm_grad_data.h
4967@@ -32,7 +32,7 @@ class MIND_API LSTMGradData : public BaseOperator {
4968   LSTMGradData() : BaseOperator(kNameLSTMGradData) {}
4969   void Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias,
4970             const float dropout, const bool bidirectional = false, const float zoneout_cell = 0.0f,
4971-            const float zoneout_hidden = 0.0f);
4972+            const float zoneout_hidden = 0.0f, const int64_t proj_size = 0);
4973   void set_input_size(const int64_t input_size);
4974   int64_t get_input_size() const;
4975   void set_hidden_size(const int64_t hidden_size);
4976@@ -52,6 +52,8 @@ class MIND_API LSTMGradData : public BaseOperator {
4977   void set_zoneout_hidden(float zoneout_hidden);
4978   float get_zoneout_hidden() const;
4979   int64_t get_good_ld(const int64_t dim, const int64_t type_size);
4980+  void set_proj_size(const int64_t proj_size);
4981+  int64_t get_proj_size() const;
4982 };
4983 MIND_API abstract::AbstractBasePtr LstmGradDataInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
4984                                                      const std::vector<abstract::AbstractBasePtr> &input_args);
4985diff --git a/mindspore/core/ops/grad/lstm_grad_weight.cc b/mindspore/core/ops/grad/lstm_grad_weight.cc
4986index 22b519c3..ce0aca94 100644
4987--- a/mindspore/core/ops/grad/lstm_grad_weight.cc
4988+++ b/mindspore/core/ops/grad/lstm_grad_weight.cc
4989@@ -88,15 +88,23 @@ void LSTMGradWeight::set_zoneout_hidden(float zoneout_hidden) {
4990
4991 float LSTMGradWeight::get_zoneout_hidden() const { return GetValue<float>(this->GetAttr(kZoneoutHidden)); }
4992
4993+void LSTMGradWeight::set_proj_size(const int64_t proj_size) {
4994+  (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name());
4995+  (void)AddAttr(kProjection_size, api::MakeValue(proj_size));
4996+}
4997+
4998+int64_t LSTMGradWeight::get_proj_size() const { return GetValue<int64_t>(GetAttr(kProjection_size)); }
4999+
5000 void LSTMGradWeight::Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers,
5001                           const bool has_bias, const float dropout, const bool bidirectional, const float zoneout_cell,
5002-                          const float zoneout_hidden) {
5003+                          const float zoneout_hidden, const int64_t proj_size) {
5004   this->set_input_size(input_size);
5005   this->set_hidden_size(hidden_size);
5006   this->set_num_layers(num_layers);
5007   this->set_has_bias(has_bias);
5008   this->set_dropout(dropout);
5009   this->set_bidirectional(bidirectional);
5010+  this->set_proj_size(proj_size);
5011   if (bidirectional) {
5012     constexpr int k2Directions = 2;
5013     this->set_num_directions(k2Directions);
5014diff --git a/mindspore/core/ops/grad/lstm_grad_weight.h b/mindspore/core/ops/grad/lstm_grad_weight.h
5015index c2ca6b5e..add816d3 100644
5016--- a/mindspore/core/ops/grad/lstm_grad_weight.h
5017+++ b/mindspore/core/ops/grad/lstm_grad_weight.h
5018@@ -32,7 +32,7 @@ class MIND_API LSTMGradWeight : public BaseOperator {
5019   LSTMGradWeight() : BaseOperator(kNameLSTMGradWeight) {}
5020   void Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias,
5021             const float dropout, const bool bidirectional = false, const float zoneout_cell = 0.0f,
5022-            const float zoneout_hidden = 0.0f);
5023+            const float zoneout_hidden = 0.0f, const int64_t proj_size = 0);
5024   void set_input_size(const int64_t input_size);
5025   int64_t get_input_size() const;
5026   void set_hidden_size(const int64_t hidden_size);
5027@@ -52,6 +52,8 @@ class MIND_API LSTMGradWeight : public BaseOperator {
5028   void set_zoneout_hidden(float zoneout_hidden);
5029   float get_zoneout_hidden() const;
5030   int64_t get_good_ld(const int64_t dim, const int64_t type_size);
5031+  void set_proj_size(const int64_t proj_size);
5032+  int64_t get_proj_size() const;
5033 };
5034 MIND_API abstract::AbstractBasePtr LstmGradWeightInfer(const abstract::AnalysisEnginePtr &,
5035                                                        const PrimitivePtr &primitive,
5036diff --git a/mindspore/core/ops/lstm.cc b/mindspore/core/ops/lstm.cc
5037index 43b9241c..937207df 100644
5038--- a/mindspore/core/ops/lstm.cc
5039+++ b/mindspore/core/ops/lstm.cc
5040@@ -68,6 +68,7 @@ abstract::TupleShapePtr LSTMInferShape(const PrimitivePtr &primitive, const std:
5041   int64_t input_x_size = GetValue<int64_t>(primitive->GetAttr(kInput_size));
5042   int64_t num_layers = GetValue<int64_t>(primitive->GetAttr(kNumLayers));
5043   bool bidirectional = GetValue<bool>(primitive->GetAttr(kBidirectional));
5044+  int64_t proj_size = GetValue<int64_t>(primitive->GetAttr(kProjection_size));
5045   int64_t num_directions = 1;
5046   if (bidirectional) {
5047     num_directions = 2;
5048@@ -90,7 +91,8 @@ abstract::TupleShapePtr LSTMInferShape(const PrimitivePtr &primitive, const std:
5049     (void)CheckAndConvertUtils::CheckInteger("h_shape[1]", h_input_shape[1], kEqual, x_input_shape[1], prim_name);
5050   }
5051
5052-  std::vector<int64_t> y_shape = {x_input_shape[0], x_input_shape[1], hidden_size * num_directions};
5053+  auto real_hidden_size = proj_size > 0 ? proj_size : hidden_size;
5054+  std::vector<int64_t> y_shape = {x_input_shape[0], x_input_shape[1], real_hidden_size * num_directions};
5055   std::vector<int64_t> h_shape = {h_input_shape};
5056   std::vector<int64_t> c_shape = {c_input_shape};
5057   std::vector<int64_t> reverse_shape = {1, 1};
5058@@ -135,6 +137,11 @@ void LSTM::set_hidden_size(const int64_t hidden_size) {
5059   (void)AddAttr(kHidden_size, api::MakeValue(hidden_size));
5060 }
5061 int64_t LSTM::get_hidden_size() const { return GetValue<int64_t>(GetAttr(kHidden_size)); }
5062+void LSTM::set_proj_size(const int64_t proj_size) {
5063+  (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name());
5064+  (void)AddAttr(kProjection_size, api::MakeValue(proj_size));
5065+}
5066+int64_t LSTM::get_proj_size() const { return GetValue<int64_t>(GetAttr(kProjection_size)); }
5067 void LSTM::set_num_layers(const int64_t num_layers) {
5068   (void)CheckAndConvertUtils::CheckInteger(kNumLayers, num_layers, kGreaterThan, 0, this->name());
5069   (void)AddAttr(kNumLayers, api::MakeValue(num_layers));
5070diff --git a/mindspore/core/ops/lstm.h b/mindspore/core/ops/lstm.h
5071index 4d3c8756..e32c5781 100644
5072--- a/mindspore/core/ops/lstm.h
5073+++ b/mindspore/core/ops/lstm.h
5074@@ -51,6 +51,12 @@ class MIND_API LSTM : public BaseOperator {
5075   ///
5076   /// \return hidden_size.
5077   int64_t get_hidden_size() const;
5078+  /// \brief Set proj_size.
5079+  void set_proj_size(const int64_t proj_size);
5080+  /// \brief Get proj_size.
5081+  ///
5082+  /// \return proj_size.
5083+  int64_t get_proj_size() const;
5084   /// \brief Set num_layers.
5085   void set_num_layers(const int64_t num_layers);
5086   /// \brief Get num_layers.
5087diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h
5088index ce68079f..ad9066e7 100644
5089--- a/mindspore/core/ops/op_name.h
5090+++ b/mindspore/core/ops/op_name.h
5091@@ -268,6 +268,7 @@ constexpr auto kWindowSize = "window_size";
5092 constexpr auto kPaddings = "paddings";
5093 constexpr auto kInput_size = "input_size";
5094 constexpr auto kHidden_size = "hidden_size";
5095+constexpr auto kProjection_size = "proj_size";
5096 constexpr auto kChannelShared = "channel_shared";
5097 constexpr auto kSlope = "slope";
5098 constexpr auto kBase = "base";
5099diff --git a/mindspore/lite/BUILD.gn b/mindspore/lite/BUILD.gn
5100index f7e465e2..9318d54e 100644
5101--- a/mindspore/lite/BUILD.gn
5102+++ b/mindspore/lite/BUILD.gn
5103@@ -602,6 +602,8 @@ all_train_sources = [
5104   "src/train/optimizer/fusion/matmul_activation_fusion_pass.cc",
5105   "src/train/optimizer/fusion/reshape_gather_reshape_fusion_pass.cc",
5106   "src/train/optimizer/fusion/gru_fusion_pass.cc",
5107+  "src/train/optimizer/fusion/matmul_add_fusion_pass.cc",
5108+  "src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc",
5109   "src/common/storage.cc",
5110   "tools/converter/optimizer.cc",
5111   "tools/converter/legacy_optimizer/fusion/fusion_pass.cc",
5112@@ -646,6 +648,7 @@ fp32_train_kernel_sources = [
5113   "src/litert/kernel/cpu/fp32_grad/convolution.cc",
5114   "src/litert/kernel/cpu/fp32_grad/convolution_grad_filter.cc",
5115   "src/litert/kernel/cpu/fp32_grad/convolution_grad_input.cc",
5116+  "src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc",
5117   "src/litert/kernel/cpu/fp32_grad/deconvolution_grad_filter.cc",
5118   "src/litert/kernel/cpu/fp32_grad/dropout.cc",
5119   "src/litert/kernel/cpu/fp32_grad/dropout_grad.cc",
5120diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt
5121index 1faf2f38..f2b5809f 100644
5122--- a/mindspore/lite/CMakeLists.txt
5123+++ b/mindspore/lite/CMakeLists.txt
5124@@ -977,7 +977,7 @@ if(MSLITE_MINDDATA_IMPLEMENT STREQUAL "lite" OR MSLITE_MINDDATA_IMPLEMENT STREQU
5125 endif()
5126
5127 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/common/ops)
5128-if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR TARGET_OHOS_LITE OR TARGET_HIMIX)
5129+if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR TARGET_OHOS_LITE OR TARGET_HIMIX OR TARGET_OHOS)
5130     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/converter/micro/coder)
5131 endif()
5132
5133diff --git a/mindspore/lite/schema/inner/ops_generated.h b/mindspore/lite/schema/inner/ops_generated.h
5134index c4fd8c15..6c861aa5 100644
5135--- a/mindspore/lite/schema/inner/ops_generated.h
5136+++ b/mindspore/lite/schema/inner/ops_generated.h
5137@@ -11338,6 +11338,7 @@ struct LSTMT : public flatbuffers::NativeTable {
5138   float dropout = 0.0f;
5139   float zoneout_cell = 0.0f;
5140   float zoneout_hidden = 0.0f;
5141+  int64_t proj_size = 0;
5142 };
5143
5144 struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5145@@ -11355,7 +11356,8 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5146     VT_NUM_DIRECTIONS = 14,
5147     VT_DROPOUT = 16,
5148     VT_ZONEOUT_CELL = 18,
5149-    VT_ZONEOUT_HIDDEN = 20
5150+    VT_ZONEOUT_HIDDEN = 20,
5151+    VT_PROJ_SIZE = 22
5152   };
5153   bool bidirectional() const {
5154     return GetField<uint8_t>(VT_BIDIRECTIONAL, 0) != 0;
5155@@ -11411,6 +11413,12 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5156   bool mutate_zoneout_hidden(float _zoneout_hidden) {
5157     return SetField<float>(VT_ZONEOUT_HIDDEN, _zoneout_hidden, 0.0f);
5158   }
5159+  int64_t proj_size() const {
5160+    return GetField<int64_t>(VT_PROJ_SIZE, 0);
5161+  }
5162+  bool mutate_proj_size(int64_t _proj_size) {
5163+    return SetField<int64_t>(VT_PROJ_SIZE, _proj_size, 0);
5164+  }
5165   bool Verify(flatbuffers::Verifier &verifier) const {
5166     return VerifyTableStart(verifier) &&
5167            VerifyField<uint8_t>(verifier, VT_BIDIRECTIONAL) &&
5168@@ -11422,6 +11430,7 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5169            VerifyField<float>(verifier, VT_DROPOUT) &&
5170            VerifyField<float>(verifier, VT_ZONEOUT_CELL) &&
5171            VerifyField<float>(verifier, VT_ZONEOUT_HIDDEN) &&
5172+           VerifyField<int64_t>(verifier, VT_PROJ_SIZE) &&
5173            verifier.EndTable();
5174   }
5175   LSTMT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
5176@@ -11460,6 +11469,9 @@ struct LSTMBuilder {
5177   void add_zoneout_hidden(float zoneout_hidden) {
5178     fbb_.AddElement<float>(LSTM::VT_ZONEOUT_HIDDEN, zoneout_hidden, 0.0f);
5179   }
5180+  void add_proj_size(int64_t proj_size) {
5181+    fbb_.AddElement<int64_t>(LSTM::VT_PROJ_SIZE, proj_size, 0);
5182+  }
5183   explicit LSTMBuilder(flatbuffers::FlatBufferBuilder &_fbb)
5184         : fbb_(_fbb) {
5185     start_ = fbb_.StartTable();
5186@@ -11481,8 +11493,10 @@ inline flatbuffers::Offset<LSTM> CreateLSTM(
5187     int64_t num_directions = 0,
5188     float dropout = 0.0f,
5189     float zoneout_cell = 0.0f,
5190-    float zoneout_hidden = 0.0f) {
5191+    float zoneout_hidden = 0.0f,
5192+    int64_t proj_size = 0) {
5193   LSTMBuilder builder_(_fbb);
5194+  builder_.add_proj_size(proj_size);
5195   builder_.add_num_directions(num_directions);
5196   builder_.add_num_layers(num_layers);
5197   builder_.add_hidden_size(hidden_size);
5198@@ -23571,6 +23585,7 @@ inline void LSTM::UnPackTo(LSTMT *_o, const flatbuffers::resolver_function_t *_r
5199   { auto _e = dropout(); _o->dropout = _e; }
5200   { auto _e = zoneout_cell(); _o->zoneout_cell = _e; }
5201   { auto _e = zoneout_hidden(); _o->zoneout_hidden = _e; }
5202+  { auto _e = proj_size(); _o->proj_size = _e; }
5203 }
5204
5205 inline flatbuffers::Offset<LSTM> LSTM::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
5206@@ -23590,6 +23605,7 @@ inline flatbuffers::Offset<LSTM> CreateLSTM(flatbuffers::FlatBufferBuilder &_fbb
5207   auto _dropout = _o->dropout;
5208   auto _zoneout_cell = _o->zoneout_cell;
5209   auto _zoneout_hidden = _o->zoneout_hidden;
5210+  auto _proj_size = _o->proj_size;
5211   return mindspore::schema::CreateLSTM(
5212       _fbb,
5213       _bidirectional,
5214@@ -23600,7 +23616,8 @@ inline flatbuffers::Offset<LSTM> CreateLSTM(flatbuffers::FlatBufferBuilder &_fbb
5215       _num_directions,
5216       _dropout,
5217       _zoneout_cell,
5218-      _zoneout_hidden);
5219+      _zoneout_hidden,
5220+      _proj_size);
5221 }
5222
5223 inline LSTMGradT *LSTMGrad::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
5224diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs
5225index 76caf810..920c0d31 100644
5226--- a/mindspore/lite/schema/ops.fbs
5227+++ b/mindspore/lite/schema/ops.fbs
5228@@ -688,6 +688,7 @@ table LSTM {
5229     dropout: float;
5230     zoneout_cell: float = 0;
5231     zoneout_hidden: float = 0;
5232+    proj_size: long = 0;
5233 }
5234
5235 table LSTMGrad {
5236diff --git a/mindspore/lite/schema/ops_generated.h b/mindspore/lite/schema/ops_generated.h
5237index 2f792706..8d387e9d 100644
5238--- a/mindspore/lite/schema/ops_generated.h
5239+++ b/mindspore/lite/schema/ops_generated.h
5240@@ -7046,7 +7046,8 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5241     VT_NUM_DIRECTIONS = 14,
5242     VT_DROPOUT = 16,
5243     VT_ZONEOUT_CELL = 18,
5244-    VT_ZONEOUT_HIDDEN = 20
5245+    VT_ZONEOUT_HIDDEN = 20,
5246+    VT_PROJ_SIZE = 22
5247   };
5248   bool bidirectional() const {
5249     return GetField<uint8_t>(VT_BIDIRECTIONAL, 0) != 0;
5250@@ -7075,6 +7076,9 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5251   float zoneout_hidden() const {
5252     return GetField<float>(VT_ZONEOUT_HIDDEN, 0.0f);
5253   }
5254+  int64_t proj_size() const {
5255+    return GetField<int64_t>(VT_PROJ_SIZE, 0);
5256+  }
5257   bool Verify(flatbuffers::Verifier &verifier) const {
5258     return VerifyTableStart(verifier) &&
5259            VerifyField<uint8_t>(verifier, VT_BIDIRECTIONAL) &&
5260@@ -7086,6 +7090,7 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
5261            VerifyField<float>(verifier, VT_DROPOUT) &&
5262            VerifyField<float>(verifier, VT_ZONEOUT_CELL) &&
5263            VerifyField<float>(verifier, VT_ZONEOUT_HIDDEN) &&
5264+           VerifyField<int64_t>(verifier, VT_PROJ_SIZE) &&
5265            verifier.EndTable();
5266   }
5267 };
5268@@ -7121,6 +7126,9 @@ struct LSTMBuilder {
5269   void add_zoneout_hidden(float zoneout_hidden) {
5270     fbb_.AddElement<float>(LSTM::VT_ZONEOUT_HIDDEN, zoneout_hidden, 0.0f);
5271   }
5272+  void add_proj_size(int64_t proj_size) {
5273+    fbb_.AddElement<int64_t>(LSTM::VT_PROJ_SIZE, proj_size, 0);
5274+  }
5275   explicit LSTMBuilder(flatbuffers::FlatBufferBuilder &_fbb)
5276         : fbb_(_fbb) {
5277     start_ = fbb_.StartTable();
5278@@ -7142,8 +7150,10 @@ inline flatbuffers::Offset<LSTM> CreateLSTM(
5279     int64_t num_directions = 0,
5280     float dropout = 0.0f,
5281     float zoneout_cell = 0.0f,
5282-    float zoneout_hidden = 0.0f) {
5283+    float zoneout_hidden = 0.0f,
5284+    int64_t proj_size = 0) {
5285   LSTMBuilder builder_(_fbb);
5286+  builder_.add_proj_size(proj_size);
5287   builder_.add_num_directions(num_directions);
5288   builder_.add_num_layers(num_layers);
5289   builder_.add_hidden_size(hidden_size);
5290diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
5291index de1781cd..469bcb6b 100644
5292--- a/mindspore/lite/src/CMakeLists.txt
5293+++ b/mindspore/lite/src/CMakeLists.txt
5294@@ -337,6 +337,8 @@ set(TRAIN_SRC
5295         ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/common/fusion_utils.cc
5296         ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/gru_fusion_pass.cc
5297         ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/matmul_activation_fusion_pass.cc
5298+        ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/matmul_add_fusion_pass.cc
5299+        ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc
5300         ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/reshape_gather_reshape_fusion_pass.cc
5301         ${TOOLS_DIR}/converter/optimizer.cc
5302         ${TOOLS_DIR}/converter/legacy_optimizer/fusion/fusion_pass.cc
5303diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc
5304index e5c7f5ca..baa2497a 100644
5305--- a/mindspore/lite/src/common/ops/ops_def.cc
5306+++ b/mindspore/lite/src/common/ops/ops_def.cc
5307@@ -688,6 +688,7 @@ OP_ATTR(num_directions, long)
5308 OP_ATTR(dropout, float)
5309 OP_ATTR_WITH_VALUE(zoneout_cell, float, 0)
5310 OP_ATTR_WITH_VALUE(zoneout_hidden, float, 0)
5311+OP_ATTR_WITH_VALUE(proj_size, long, 0)
5312 OP_SCHEMA_DEF_END(LSTM)
5313
5314 OP_SCHEMA_DEF(LSTMGrad)
5315diff --git a/mindspore/lite/src/common/ops/populate/custom_populate.cc b/mindspore/lite/src/common/ops/populate/custom_populate.cc
5316index 13957ed7..6c490130 100644
5317--- a/mindspore/lite/src/common/ops/populate/custom_populate.cc
5318+++ b/mindspore/lite/src/common/ops/populate/custom_populate.cc
5319@@ -22,6 +22,7 @@
5320 #include "nnacl/custom_masked_fill_parameter.h"
5321 #include "nnacl/custom_is_inf_parameter.h"
5322 #include "nnacl/custom_tensor_scatter_max_parameter.h"
5323+#include "nnacl/custom_gather_d_grad_v2_parameter.h"
5324 using mindspore::schema::PrimitiveType_Custom;
5325
5326 namespace mindspore {
5327@@ -128,6 +129,33 @@ OpParameter *CreateCustomMaskedFillParameter() {
5328   return reinterpret_cast<OpParameter *>(param);
5329 }
5330
5331+OpParameter *CreateCustomGatherDGradV2Parameter(const schema::Custom *value) {
5332+  if (value->attr()->size() < 1) {
5333+    return nullptr;
5334+  }
5335+  auto *param = static_cast<CustomGatherGradV2Parameter *>(malloc(sizeof(CustomGatherGradV2Parameter)));
5336+  if (param == nullptr) {
+    MS_LOG(ERROR) << "malloc CustomGatherGradV2Parameter failed.";
5338+    return nullptr;
5339+  }
5340+
5341+  std::string dim_str;
5342+  auto attrs = value->attr();
5343+  for (size_t i = 0; i < attrs->size(); i++) {
5344+    auto attr = attrs->Get(i);
5345+    if (attr->name()->str() == "dim") {
5346+      auto data = attr->data();
5347+      dim_str = std::string(reinterpret_cast<const char *>(data->Data()), data->size());
5348+      break;
5349+    }
5350+  }
5351+
+  memset(param, 0, sizeof(CustomGatherGradV2Parameter));
+  param->dim = dim_str.empty() ? 0 : std::stoi(dim_str);
5354+  param->op_parameter_.type_ = PrimType_Inner_CustomGatherDGradV2;
5355+  return reinterpret_cast<OpParameter *>(param);
5356+}
5357+
5358 OpParameter *PopulateCustomParameter(const void *prim) {
5359   MS_CHECK_TRUE_RET(prim != nullptr, nullptr);
5360   auto primitive = static_cast<const schema::Primitive *>(prim);
5361@@ -167,6 +195,8 @@ OpParameter *PopulateCustomParameter(const void *prim) {
5362     return CreateCustomGruParameter();
5363   } else if (type == "CastGatherReduceFusion") {
5364     return CreateParam(PrimType_Inner_CastGatherReduceFusion);
5365+  } else if (type == "GatherDGradV2") {
5366+    return CreateCustomGatherDGradV2Parameter(value);
5367   } else if (type == "ThirdPartyModel") {
5368     auto *param = static_cast<CustomParameter *>(malloc(sizeof(CustomParameter)));
5369     if (param == nullptr) {
5370diff --git a/mindspore/lite/src/common/ops/populate/lstm_populate.cc b/mindspore/lite/src/common/ops/populate/lstm_populate.cc
5371index 522da7ad..b3a85b64 100644
5372--- a/mindspore/lite/src/common/ops/populate/lstm_populate.cc
5373+++ b/mindspore/lite/src/common/ops/populate/lstm_populate.cc
5374@@ -37,8 +37,12 @@ OpParameter *PopulateLstmParameter(const void *prim) {
5375
5376   param->op_parameter_.type_ = primitive->value_type();
5377   param->bidirectional_ = value->bidirectional();
5378+  param->has_bias_ = value->has_bias();
5379+  param->input_size_ = value->input_size();
5380+  param->hidden_size_ = value->hidden_size();
5381   param->zoneout_cell_ = value->zoneout_cell();
5382   param->zoneout_hidden_ = value->zoneout_hidden();
5383+  param->project_size_ = value->proj_size();
5384   return reinterpret_cast<OpParameter *>(param);
5385 }
5386
5387diff --git a/mindspore/lite/src/common/prim_util.cc b/mindspore/lite/src/common/prim_util.cc
5388index 5ded05e9..7263775a 100644
5389--- a/mindspore/lite/src/common/prim_util.cc
5390+++ b/mindspore/lite/src/common/prim_util.cc
5391@@ -29,11 +29,25 @@ static std::set<schema::PrimitiveType> kTensorListOps = {
5392   schema::PrimitiveType_TensorListReserve, schema::PrimitiveType_TensorListSetItem,
5393   schema::PrimitiveType_TensorListStack};
5394
5395-static const char *const kInnerOpNames[C10NUM] = {"Inner_ToFormat",         "Inner_GltextureToOpencl",
5396-                                                  "Inner_Identity",         "Inner_ShapeFusion",
5397-                                                  "Inner_GraphKernel",      "Inner_SplitReduceConcatFusion",
5398-                                                  "Inner_EncoderLayer",     "Inner_DecoderLayer",
5399-                                                  "Inner_UsePastEmbedding", "Inner_CustomGru"};
5400+static const char *const kInnerOpNames[C20NUM] = {"Inner_ToFormat",
5401+                                                  "Inner_GltextureToOpencl",
5402+                                                  "Inner_Identity",
5403+                                                  "Inner_ShapeFusion",
5404+                                                  "Inner_GraphKernel",
5405+                                                  "Inner_SplitReduceConcatFusion",
5406+                                                  "Inner_EncoderLayer",
5407+                                                  "PrimType_Inner_FseDecode",
5408+                                                  "Inner_DecoderLayer",
5409+                                                  "Inner_UsePastEmbedding",
5410+                                                  "Inner_CustomGru",
5411+                                                  "PrimType_Inner_CastGatherReduceFusion",
5412+                                                  "PrimType_Inner_ReduceConcatFusion",
5413+                                                  "PrimType_Inner_ThirdPartyModel",
5414+                                                  "PrimType_Inner_CustomMaskedFill",
5415+                                                  "PrimType_Inner_CustomTensorScatterMax",
5416+                                                  "PrimType_Inner_CustomIsInf",
5417+                                                  "PrimType_Inner_CustomGatherDGradV2"};
5418+
5419 int GetPrimitiveType(const void *primitive, int schema_version) {
5420   if (primitive == nullptr) {
5421     return -1;
5422diff --git a/mindspore/lite/src/litert/kernel/cpu/BUILD.gn b/mindspore/lite/src/litert/kernel/cpu/BUILD.gn
5423index 65065b5b..7b813314 100644
5424--- a/mindspore/lite/src/litert/kernel/cpu/BUILD.gn
5425+++ b/mindspore/lite/src/litert/kernel/cpu/BUILD.gn
5426@@ -85,6 +85,9 @@ cpu_kernel_sources = [
5427     "fp32/invert_permutation_fp32.cc",
5428     "fp32/l2_norm_fp32.cc",
5429     "fp32/lstm_fp32.cc",
5430+    "fp32/lstm_fp32_base.cc",
5431+    "fp32/lstm_mindir_fp32.cc",
5432+    "fp32/lstm_non_mindir_fp32.cc",
5433     "fp32/matmul_fp32_arm32.cc",
5434     "fp32/matmul_fp32_arm64.cc",
5435     "fp32/matmul_fp32_avx512.cc",
5436@@ -174,6 +177,9 @@ fp16_kernel_sources = [
5437   "fp16/instance_norm_fp16.cc",
5438   "fp16/layout_transform_fp16.cc",
5439   "fp16/lstm_fp16.cc",
5440+  "fp16/lstm_fp16_base.cc",
5441+  "fp16/lstm_mindir_fp16.cc",
5442+  "fp16/lstm_non_mindir_fp16.cc",
5443   "fp16/matmul_base_fp16.cc",
5444   "fp16/matmul_fp16.cc",
5445   "fp16/power_fp16.cc",
5446diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc
5447index 232bbe44..89945e1c 100644
5448--- a/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc
5449+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc
5450@@ -100,10 +100,10 @@ int GruFp16CPUKernel::InitInputWeightBias() {
5451   }
5452   if (weight_g->data_type() == kNumberTypeFloat32) {
5453     PackLstmWeightFp32ToFp16(weight_g_ptr_, reinterpret_cast<float *>(weight_g->data()), weight_batch_,
5454-                             gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_);
5455+                             gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_, nullptr);
5456   } else if (weight_g->data_type() == kNumberTypeFloat16) {
5457     PackLstmWeightFp16(weight_g_ptr_, reinterpret_cast<float16_t *>(weight_g->data()), weight_batch_,
5458-                       gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_);
5459+                       gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_, nullptr);
5460   } else {
5461     MS_LOG(ERROR) << "Unsupported data type of weight_g tensor for gru.";
5462     return RET_ERROR;
5463@@ -120,10 +120,10 @@ int GruFp16CPUKernel::InitInputWeightBias() {
5464   memset(input_bias_, 0, weight_batch_ * gru_param_->input_col_align_ * sizeof(float16_t));
5465   if (bias->data_type() == kNumberTypeFloat32) {
5466     PackLstmBiasFp32ToFp16(input_bias_, reinterpret_cast<float *>(bias->data()), weight_batch_,
5467-                           gru_param_->hidden_size_, gru_param_->input_col_align_, gru_param_->bidirectional_);
5468+                           gru_param_->hidden_size_, gru_param_->input_col_align_, gru_param_->bidirectional_, nullptr);
5469   } else if (bias->data_type() == kNumberTypeFloat16) {
5470     PackLstmBiasFp16(input_bias_, reinterpret_cast<float16_t *>(bias->data()), weight_batch_, gru_param_->hidden_size_,
5471-                     gru_param_->input_col_align_, gru_param_->bidirectional_);
5472+                     gru_param_->input_col_align_, gru_param_->bidirectional_, nullptr);
5473   } else {
5474     MS_LOG(ERROR) << "Unsupported data type of bias tensor for gru.";
5475     return RET_ERROR;
5476@@ -148,10 +148,10 @@ int GruFp16CPUKernel::InitStateWeightBias() {
5477   if (!is_vec_) {
5478     if (weight_r->data_type() == kNumberTypeFloat32) {
5479       PackLstmWeightFp32ToFp16(weight_r_ptr_, reinterpret_cast<float *>(weight_r->data()), weight_batch_,
5480-                               gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_);
5481+                               gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_, nullptr);
5482     } else if (weight_r->data_type() == kNumberTypeFloat16) {
5483       PackLstmWeightFp16(weight_r_ptr_, reinterpret_cast<float16_t *>(weight_r->data()), weight_batch_,
5484-                         gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_);
5485+                         gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_, nullptr);
5486     } else {
5487       MS_LOG(ERROR) << "Unsupported data type of weight_r tensor for gru.";
5488       return RET_ERROR;
5489@@ -179,11 +179,11 @@ int GruFp16CPUKernel::InitStateWeightBias() {
5490   if (bias->data_type() == kNumberTypeFloat32) {
5491     auto state_bias_data = reinterpret_cast<float *>(bias->data()) + gate_num * gru_param_->hidden_size_;
5492     PackLstmBiasFp32ToFp16(state_bias_, state_bias_data, weight_batch_, gru_param_->hidden_size_,
5493-                           gru_param_->state_col_align_, gru_param_->bidirectional_);
5494+                           gru_param_->state_col_align_, gru_param_->bidirectional_, nullptr);
5495   } else if (bias->data_type() == kNumberTypeFloat16) {
5496     auto state_bias_data = reinterpret_cast<float16_t *>(bias->data()) + gate_num * gru_param_->hidden_size_;
5497     PackLstmBiasFp16(state_bias_, state_bias_data, weight_batch_, gru_param_->hidden_size_,
5498-                     gru_param_->state_col_align_, gru_param_->bidirectional_);
5499+                     gru_param_->state_col_align_, gru_param_->bidirectional_, nullptr);
5500   } else {
5501     MS_LOG(ERROR) << "Unsupported data type of bias tensor for gru.";
5502     return RET_ERROR;
5503diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc
5504index b583358a..bd99b791 100644
5505--- a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc
5506+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc
5507@@ -1,5 +1,5 @@
5508 /**
5509- * Copyright 2021 Huawei Technologies Co., Ltd
5510+ * Copyright 2021-2023 Huawei Technologies Co., Ltd
5511  *
5512  * Licensed under the Apache License, Version 2.0 (the "License");
5513  * you may not use this file except in compliance with the License.
5514@@ -16,13 +16,9 @@
5515
5516 #include "src/litert/kernel/cpu/fp16/lstm_fp16.h"
5517 #include <vector>
5518-#include <cfloat>
5519-#include "schema/model_generated.h"
5520+#include "src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h"
5521+#include "src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h"
5522 #include "src/litert/kernel_registry.h"
5523-#include "include/errorcode.h"
5524-#include "nnacl/fp16/lstm_fp16.h"
5525-#include "nnacl/fp16/cast_fp16.h"
5526-#include "nnacl/errorcode.h"
5527
5528 using mindspore::kernel::KERNEL_ARCH;
5529 using mindspore::lite::KernelRegistrar;
5530@@ -31,389 +27,34 @@ using mindspore::lite::RET_OK;
5531 using mindspore::schema::PrimitiveType_LSTM;
5532
5533 namespace mindspore::kernel {
5534-void LstmFp16CPUKernel::FreeTmpBuffer() {
5535-  if (weight_i_ptr_ != nullptr) {
5536-    free(weight_i_ptr_);
5537-    weight_i_ptr_ = nullptr;
5538-  }
5539-  if (input_bias_ != nullptr) {
5540-    free(input_bias_);
5541-    input_bias_ = nullptr;
5542-  }
5543-  if (weight_h_ptr_ != nullptr) {
5544-    free(weight_h_ptr_);
5545-    weight_h_ptr_ = nullptr;
5546-  }
5547-  if (state_bias_ != nullptr) {
5548-    free(state_bias_);
5549-    state_bias_ = nullptr;
5550-  }
5551-  if (weight_project_ptr_ != nullptr) {
5552-    free(weight_project_ptr_);
5553-    weight_project_ptr_ = nullptr;
5554-  }
5555-  if (project_bias_ != nullptr) {
5556-    free(project_bias_);
5557-    project_bias_ = nullptr;
5558-  }
5559-}
5560-
5561-void LstmFp16CPUKernel::FreeRunBuffer() {
5562-  ms_context_->allocator->Free(buffer_[packed_input_index]);
5563-  ms_context_->allocator->Free(buffer_[input_gate_index]);
5564-  if (!is_vec_) {
5565-    ms_context_->allocator->Free(buffer_[packed_state_index]);
5566-  }
5567-  ms_context_->allocator->Free(buffer_[state_gate_index]);
5568-  if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) {
5569-    ms_context_->allocator->Free(buffer_[cell_state_index]);
5570-  }
5571-  if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) {
5572-    ms_context_->allocator->Free(buffer_[hidden_state_index]);
5573-  }
5574-}
5575-
5576-int LstmFp16CPUKernel::InitParam() {
5577-  auto input = in_tensors_.front();
5578-  std::vector<int> in_shape = input->shape();
5579-  lstm_param_->seq_len_ = in_shape.at(0);
5580-  lstm_param_->batch_ = in_shape.at(1);
5581-  lstm_param_->input_size_ = in_shape.at(kNHWC_W);
5582-
5583-  auto weight_i = in_tensors_.at(1);
5584-  std::vector<int> w_shape = weight_i->shape();
5585-  NNACL_CHECK_ZERO_RETURN_ERR(gate_num);
5586-  lstm_param_->hidden_size_ = w_shape.at(1) / gate_num;
5587-
5588-  auto weight_h = in_tensors_.at(C2NUM);
5589-  auto h_shape = weight_h->shape();
5590-  lstm_param_->project_size_ = h_shape.back();
5591-
5592-  const int twice = 2;
5593-  lstm_param_->output_step_ = lstm_param_->bidirectional_ ? twice * lstm_param_->batch_ * lstm_param_->hidden_size_
5594-                                                          : lstm_param_->batch_ * lstm_param_->hidden_size_;
5595-  weight_batch_ = lstm_param_->bidirectional_ ? twice * gate_num : gate_num;
5596-  lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, C16NUM);
5597-  lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C8NUM);
5598-
5599-  is_vec_ = lstm_param_->batch_ == 1;
5600-  lstm_param_->state_row_align_ = is_vec_ ? lstm_param_->batch_ : UP_ROUND(lstm_param_->batch_, C16NUM);
5601-  lstm_param_->state_col_align_ = is_vec_ ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, C8NUM);
5602-  return RET_OK;
5603-}
5604-
5605-int LstmFp16CPUKernel::InitInputWeightBias() {
5606-  // malloc and init input * weight right matrix buffer
5607-  // input -- row: seq_len * batch; col: input_size
5608-  // weight -- row: hidden_size; col: input_size, need transpose
5609-  // result -- row: seq_len * batch; col: hidden_size
5610-  auto weight_i = in_tensors_.at(1);
5611-  auto weight_i_data = weight_i->data();
5612-  CHECK_NULL_RETURN(weight_i_data);
5613-  weight_i_ptr_ = reinterpret_cast<float16_t *>(
5614-    malloc(weight_batch_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float16_t)));
5615-  if (weight_i_ptr_ == nullptr) {
5616-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc weight_i_ptr_ error.";
5617-    return RET_ERROR;
5618-  }
5619-  if (weight_i->data_type() == kNumberTypeFloat32) {
5620-    PackLstmWeightFp32ToFp16(weight_i_ptr_, reinterpret_cast<float *>(weight_i_data), weight_batch_,
5621-                             lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_);
5622-  } else if (weight_i->data_type() == kNumberTypeFloat16) {
5623-    PackLstmWeightFp16(weight_i_ptr_, reinterpret_cast<float16_t *>(weight_i_data), weight_batch_,
5624-                       lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_);
5625+namespace {
5626+constexpr size_t kMindirInputTensorNum = 4;
5627+}  // namespace
5628+
5629+LiteKernel *LstmFp16KernelCreator(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
5630+                                  OpParameter *parameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
5631+  if (parameter == nullptr) {
5632+    MS_LOG(ERROR) << "parameter is nullptr.";
5633+    return nullptr;
5634+  }
5635+  if (desc.data_type == kTypeUnknown) {
5636+    MS_LOG(WARNING) << "desc data_type is unknown.";
5637+  }
5638+  LiteKernel *kernel{nullptr};
5639+  if (inputs.size() == kMindirInputTensorNum) {
5640+    kernel = new (std::nothrow)
5641+      LstmMindirFp16CPUKernel(parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
5642   } else {
5643-    MS_LOG(ERROR) << "Unsupported data type of weight_i tensor for lstm.";
5644-    return RET_ERROR;
5645-  }
5646-
5647-  // input bias
5648-  auto bias = in_tensors_.at(FOURTH_INPUT);
5649-  auto bias_data = bias->data();
5650-  CHECK_NULL_RETURN(bias_data);
5651-  input_bias_ =
5652-    reinterpret_cast<float16_t *>(malloc(weight_batch_ * lstm_param_->input_col_align_ * sizeof(float16_t)));
5653-  if (input_bias_ == nullptr) {
5654-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc input_bias_ error.";
5655-    return RET_ERROR;
5656-  }
5657-  memset(input_bias_, 0, weight_batch_ * lstm_param_->input_col_align_ * sizeof(float16_t));
5658-  if (bias->data_type() == kNumberTypeFloat32) {
5659-    PackLstmBiasFp32ToFp16(input_bias_, reinterpret_cast<float *>(bias_data), weight_batch_, lstm_param_->hidden_size_,
5660-                           lstm_param_->input_col_align_, lstm_param_->bidirectional_);
5661-  } else if (bias->data_type() == kNumberTypeFloat16) {
5662-    PackLstmBiasFp16(input_bias_, reinterpret_cast<float16_t *>(bias_data), weight_batch_, lstm_param_->hidden_size_,
5663-                     lstm_param_->input_col_align_, lstm_param_->bidirectional_);
5664-  } else {
5665-    MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm.";
5666-    return RET_ERROR;
5667-  }
5668-  return RET_OK;
5669-}
5670-
5671-int LstmFp16CPUKernel::InitStateWeightBias() {
5672-  // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times.
5673-  // state -- row: batch; col: hidden_size
5674-  // weight -- row: hidden_size; col: hidden_size, need transpose
5675-  // result -- row: batch; col: hidden_size
5676-  auto weight_h = in_tensors_.at(THIRD_INPUT);
5677-  auto weight_h_data = weight_h->data();
5678-  CHECK_NULL_RETURN(weight_h_data);
5679-  weight_h_ptr_ = reinterpret_cast<float16_t *>(
5680-    malloc(weight_batch_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * sizeof(float16_t)));
5681-  if (weight_h_ptr_ == nullptr) {
5682-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc weight_h_ptr_ error.";
5683-    return RET_ERROR;
5684-  }
5685-
5686-  if (!is_vec_) {
5687-    if (weight_h->data_type() == kNumberTypeFloat32) {
5688-      PackLstmWeightFp32ToFp16(weight_h_ptr_, reinterpret_cast<float *>(weight_h_data), weight_batch_,
5689-                               lstm_param_->project_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_);
5690-    } else if (weight_h->data_type() == kNumberTypeFloat16) {
5691-      PackLstmWeightFp16(weight_h_ptr_, reinterpret_cast<float16_t *>(weight_h_data), weight_batch_,
5692-                         lstm_param_->project_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_);
5693-    } else {
5694-      MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm.";
5695-      return RET_ERROR;
5696-    }
5697-  } else {
5698-    if (weight_h->data_type() == kNumberTypeFloat32) {
5699-      Float32ToFloat16(reinterpret_cast<float *>(weight_h_data), weight_h_ptr_, weight_h->ElementsNum());
5700-    } else if (weight_h->data_type() == kNumberTypeFloat16) {
5701-      memcpy(weight_h_ptr_, reinterpret_cast<float16_t *>(weight_h_data), weight_h->Size());
5702-    } else {
5703-      MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm.";
5704-      return RET_ERROR;
5705-    }
5706-  }
5707-
5708-  // state bias
5709-  auto bias = in_tensors_.at(FOURTH_INPUT);
5710-  auto bias_data = bias->data();
5711-  CHECK_NULL_RETURN(bias_data);
5712-  state_bias_ =
5713-    reinterpret_cast<float16_t *>(malloc(weight_batch_ * lstm_param_->state_col_align_ * sizeof(float16_t)));
5714-  if (state_bias_ == nullptr) {
5715-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_bias_ error.";
5716-    return RET_ERROR;
5717-  }
5718-  memset(state_bias_, 0, weight_batch_ * lstm_param_->state_col_align_ * sizeof(float16_t));
5719-  if (bias->data_type() == kNumberTypeFloat32) {
5720-    auto state_bias_data = reinterpret_cast<float *>(bias_data) + gate_num * lstm_param_->hidden_size_;
5721-    PackLstmBiasFp32ToFp16(state_bias_, state_bias_data, weight_batch_, lstm_param_->hidden_size_,
5722-                           lstm_param_->state_col_align_, lstm_param_->bidirectional_);
5723-  } else if (bias->data_type() == kNumberTypeFloat16) {
5724-    auto state_bias_data = reinterpret_cast<float16_t *>(bias_data) + gate_num * lstm_param_->hidden_size_;
5725-    PackLstmBiasFp16(state_bias_, state_bias_data, weight_batch_, lstm_param_->hidden_size_,
5726-                     lstm_param_->state_col_align_, lstm_param_->bidirectional_);
5727-  } else {
5728-    MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm.";
5729-    return RET_ERROR;
5730-  }
5731-  return RET_OK;
5732-}
5733-
5734-int LstmFp16CPUKernel::InitProjectWeight() {
5735-  if (in_tensors_.size() < C7NUM) {
5736-    return RET_OK;
5737-  }
5738-  auto weight_pro = in_tensors_.at(SEVENTH_INPUT);
5739-  auto shape = weight_pro->shape();
5740-  if (shape.size() != C3NUM) {
5741-    MS_LOG(ERROR) << "Project-weight's shape must be 3D.";
5742-    return RET_ERROR;
5743-  }
5744-  auto weight_pro_data = weight_pro->data();
5745-  CHECK_NULL_RETURN(weight_pro_data);
5746-  int batch = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
5747-  if (shape[0] != batch) {
5748-    MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true).";
5749-    return RET_ERROR;
5750+    kernel = new (std::nothrow)
5751+      LstmNonMindirFp16CPUKernel(parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
5752   }
5753-  int pro_col_align = is_vec_ ? lstm_param_->project_size_ : UP_ROUND(lstm_param_->project_size_, C8NUM);
5754-  weight_project_ptr_ =
5755-    reinterpret_cast<float16_t *>(malloc(batch * lstm_param_->hidden_size_ * pro_col_align * sizeof(float16_t)));
5756-  if (weight_project_ptr_ == nullptr) {
5757-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc weight_project_ptr_ error.";
5758-    return RET_ERROR;
5759-  }
5760-
5761-  if (!is_vec_) {
5762-    if (weight_pro->data_type() == kNumberTypeFloat32) {
5763-      PackLstmWeightFp32ToFp16(weight_project_ptr_, reinterpret_cast<float *>(weight_pro_data), batch,
5764-                               lstm_param_->hidden_size_, lstm_param_->project_size_, pro_col_align);
5765-    } else if (weight_pro->data_type() == kNumberTypeFloat16) {
5766-      PackLstmWeightFp16(weight_project_ptr_, reinterpret_cast<float16_t *>(weight_pro_data), batch,
5767-                         lstm_param_->hidden_size_, lstm_param_->project_size_, pro_col_align);
5768-    } else {
5769-      MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm.";
5770-      return RET_ERROR;
5771-    }
5772-  } else {
5773-    if (weight_pro->data_type() == kNumberTypeFloat32) {
5774-      Float32ToFloat16(reinterpret_cast<float *>(weight_pro_data), weight_project_ptr_, weight_pro->ElementsNum());
5775-    } else if (weight_pro->data_type() == kNumberTypeFloat16) {
5776-      memcpy(weight_project_ptr_, weight_pro_data, weight_pro->Size());
5777-    } else {
5778-      MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm.";
5779-      return RET_ERROR;
5780-    }
5781-  }
5782-  size_t bias_size = UP_ROUND(lstm_param_->project_size_, C8NUM) * sizeof(float16_t);
5783-  project_bias_ = reinterpret_cast<float16_t *>(malloc(bias_size));
5784-  if (project_bias_ == nullptr) {
5785-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc project_bias_ error.";
5786-    return RET_ERROR;
5787-  }
5788-  (void)memset(project_bias_, 0, bias_size);
5789-  return RET_OK;
5790-}
5791-
5792-int LstmFp16CPUKernel::Prepare() {
5793-  CHECK_LESS_RETURN(in_tensors_.size(), C6NUM);
5794-  for (size_t i = 0; i < in_tensors_.size(); i++) {
5795-    CHECK_NULL_RETURN(in_tensors_.at(i));
5796-  }
5797-  CHECK_LESS_RETURN(out_tensors_.size(), C3NUM);
5798-  for (size_t i = 0; i < out_tensors_.size(); i++) {
5799-    CHECK_NULL_RETURN(out_tensors_.at(i));
5800-  }
5801-  CHECK_NULL_RETURN(lstm_param_);
5802-  if (!InferShapeDone()) {
5803-    return RET_OK;
5804-  }
5805-  return ReSize();
5806-}
5807-
5808-int LstmFp16CPUKernel::ReSize() {
5809-  auto ret = InitParam();
5810-  if (ret != RET_OK) {
5811-    MS_LOG(ERROR) << "Lstm fp16 InitParam error.";
5812-    return RET_ERROR;
5813-  }
5814-
5815-  FreeTmpBuffer();
5816-  ret = InitInputWeightBias();
5817-  if (ret != RET_OK) {
5818-    MS_LOG(ERROR) << "Lstm fp16 InitInputWeightBias error.";
5819-    FreeTmpBuffer();
5820-    return RET_ERROR;
5821-  }
5822-
5823-  ret = InitStateWeightBias();
5824-  if (ret != RET_OK) {
5825-    MS_LOG(ERROR) << "Lstm fp16 InitStateWeightBias error.";
5826-    FreeTmpBuffer();
5827-    return RET_ERROR;
5828-  }
5829-
5830-  ret = InitProjectWeight();
5831-  if (ret != RET_OK) {
5832-    MS_LOG(ERROR) << "Lstm fp16 InitProjectWeight error.";
5833-    FreeTmpBuffer();
5834-    return RET_ERROR;
5835-  }
5836-  return RET_OK;
5837-}
5838-
5839-int LstmFp16CPUKernel::MallocRunBuffer() {
5840-  for (int i = 0; i < C7NUM; i++) {
5841-    buffer_[i] = nullptr;
5842-  }
5843-  buffer_[packed_input_index] = reinterpret_cast<float16_t *>(
5844-    ms_context_->allocator->Malloc(lstm_param_->input_row_align_ * lstm_param_->input_size_ * sizeof(float16_t)));
5845-  if (buffer_[packed_input_index] == nullptr) {
5846-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc input * weight left matirx error.";
5847-    return RET_ERROR;
5848-  }
5849-
5850-  buffer_[input_gate_index] = reinterpret_cast<float16_t *>(ms_context_->allocator->Malloc(
5851-    gate_num * lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t)));
5852-  if (buffer_[input_gate_index] == nullptr) {
5853-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error.";
5854-    return RET_ERROR;
5855-  }
5856-
5857-  if (!is_vec_) {
5858-    buffer_[packed_state_index] = reinterpret_cast<float16_t *>(
5859-      ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->project_size_ * sizeof(float16_t)));
5860-    if (buffer_[packed_state_index] == nullptr) {
5861-      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error.";
5862-      return RET_ERROR;
5863-    }
5864-  }
5865-
5866-  buffer_[state_gate_index] = reinterpret_cast<float16_t *>(
5867-    ms_context_->allocator->Malloc(gate_num * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t)));
5868-  if (buffer_[state_gate_index] == nullptr) {
5869-    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state gate buffer_ error.";
5870-    return RET_ERROR;
5871-  }
5872-
5873-  if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) {
5874-    int buffer_size = lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t);
5875-    buffer_[cell_state_index] = reinterpret_cast<float16_t *>(ms_context_->allocator->Malloc(buffer_size));
5876-    if (buffer_[cell_state_index] == nullptr) {
5877-      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for cell error.";
5878-      return RET_ERROR;
5879-    }
5880-  }
5881-  if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) {
5882-    int buffer_size = lstm_param_->batch_ * lstm_param_->project_size_ * sizeof(float16_t);
5883-    buffer_[hidden_state_index] = reinterpret_cast<float16_t *>(ms_context_->allocator->Malloc(buffer_size));
5884-    if (buffer_[hidden_state_index] == nullptr) {
5885-      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for hidden error.";
5886-      return RET_ERROR;
5887-    }
5888-  }
5889-  if (!is_vec_ && in_tensors_.size() == C7NUM) {
5890-    buffer_[project_input_index] = reinterpret_cast<float16_t *>(
5891-      ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * sizeof(float16_t)));
5892-    if (buffer_[project_input_index] == nullptr) {
5893-      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc project_buffer for hidden error.";
5894-      return RET_ERROR;
5895-    }
5896-  }
5897-  return RET_OK;
5898-}
5899-
5900-int LstmFp16CPUKernel::Run() {
5901-  auto input = in_tensors_.at(0);
5902-  auto input_ptr = reinterpret_cast<float16_t *>(input->data());
5903-  CHECK_NULL_RETURN(input_ptr);
5904-  auto output = out_tensors_.at(0);
5905-  auto output_ptr = reinterpret_cast<float16_t *>(output->data());
5906-  CHECK_NULL_RETURN(output_ptr);
5907-
5908-  auto hidden_state = in_tensors_.at(FIFTH_INPUT);
5909-  CHECK_NULL_RETURN(hidden_state->data());
5910-  auto cell_state = in_tensors_.at(SIXTH_INPUT);
5911-  CHECK_NULL_RETURN(cell_state->data());
5912-
5913-  auto output_hidden_state = out_tensors_[1];
5914-  CHECK_NULL_RETURN(output_hidden_state->data());
5915-  memcpy(output_hidden_state->data(), hidden_state->data(), hidden_state->ElementsNum() * sizeof(float16_t));
5916-  auto output_cell_state = out_tensors_[THIRD_INPUT];
5917-  CHECK_NULL_RETURN(output_cell_state->data());
5918-  memcpy(output_cell_state->data(), cell_state->data(), cell_state->ElementsNum() * sizeof(float16_t));
5919-
5920-  auto ret = MallocRunBuffer();
5921-  if (ret != RET_OK) {
5922-    MS_LOG(ERROR) << "LstmFp16CPUKernel MallocRunBuffer error.";
5923-    FreeRunBuffer();
5924-    return RET_ERROR;
5925+  if (kernel == nullptr) {
5926+    MS_LOG(ERROR) << "kernel: " << parameter->name_ << "is nullptr.";
5927+    free(parameter);
5928+    return nullptr;
5929   }
5930-  CHECK_NULL_RETURN(weight_i_ptr_);
5931-  CHECK_NULL_RETURN(weight_h_ptr_);
5932-  CHECK_NULL_RETURN(input_bias_);
5933-  CHECK_NULL_RETURN(state_bias_);
5934-  LstmFp16(output_ptr, input_ptr, weight_i_ptr_, weight_h_ptr_, input_bias_, state_bias_, weight_project_ptr_,
5935-           project_bias_, reinterpret_cast<float16_t *>(output_hidden_state->data()),
5936-           reinterpret_cast<float16_t *>(output_cell_state->data()), buffer_, lstm_param_);
5937-  FreeRunBuffer();
5938-  return RET_OK;
5939+  return kernel;
5940 }
5941
5942-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LSTM, LiteKernelCreator<LstmFp16CPUKernel>)
5943+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LSTM, LstmFp16KernelCreator)
5944 }  // namespace mindspore::kernel
5945diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.cc
5946new file mode 100644
5947index 00000000..767fdef3
5948--- /dev/null
5949+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.cc
5950@@ -0,0 +1,270 @@
5951+/**
5952+ * Copyright 2023 Huawei Technologies Co., Ltd
5953+ *
5954+ * Licensed under the Apache License, Version 2.0 (the "License");
5955+ * you may not use this file except in compliance with the License.
5956+ * You may obtain a copy of the License at
5957+ *
5958+ * http://www.apache.org/licenses/LICENSE-2.0
5959+ *
5960+ * Unless required by applicable law or agreed to in writing, software
5961+ * distributed under the License is distributed on an "AS IS" BASIS,
5962+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5963+ * See the License for the specific language governing permissions and
5964+ * limitations under the License.
5965+ */
5966+
5967+#include "src/litert/kernel/cpu/fp16/lstm_fp16_base.h"
5968+#include <cfloat>
5969+#include "nnacl/fp16/lstm_fp16.h"
5970+
5971+using mindspore::lite::RET_ERROR;
5972+using mindspore::lite::RET_OK;
5973+
5974+namespace mindspore::kernel {
5975+namespace {
5976+constexpr int kGateNum = 4;
5977+constexpr int kTempInputBufferIndex = 0;
5978+constexpr int kTempInputGateBufferIndex = 1;
5979+constexpr int kTempStateBufferIndex = 2;
5980+constexpr int kTempStateGateBufferIndex = 3;
5981+constexpr int kTempCellStateBufferIndex = 4;
5982+constexpr int kTempHiddenStateBufferIndex = 5;
5983+constexpr int kTempProjectInputBufferIndex = 6;
5984+}  // namespace
5985+
5986+LstmFp16BaseCPUKernel::~LstmFp16BaseCPUKernel() { FreePackBuffer(); }
5987+
5988+int LstmFp16BaseCPUKernel::Prepare() {
5989+  for (size_t i = 0; i < in_tensors_.size(); ++i) {
5990+    CHECK_NULL_RETURN(in_tensors_[i]);
5991+  }
5992+  CHECK_LESS_RETURN(out_tensors_.size(), C3NUM);
5993+  for (size_t i = 0; i < out_tensors_.size(); ++i) {
5994+    CHECK_NULL_RETURN(out_tensors_[i]);
5995+  }
5996+  CHECK_NULL_RETURN(lstm_param_);
5997+  if (!InferShapeDone()) {
5998+    return RET_OK;
5999+  }
6000+  return ReSize();
6001+}
6002+
6003+int LstmFp16BaseCPUKernel::ReSize() {
6004+  auto ret = InitParam();
6005+  if (ret != RET_OK) {
6006+    MS_LOG(ERROR) << "LstmFp16 InitParam failed.";
6007+    return RET_ERROR;
6008+  }
6009+  if (running_pack_) {
6010+    return RET_OK;
6011+  }
6012+  return PackWeightAndBias();
6013+}
6014+
6015+int LstmFp16BaseCPUKernel::Run() {
6016+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_[FIRST_INPUT]->data());
6017+  CHECK_NULL_RETURN(input_ptr);
6018+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_[FIRST_INPUT]->data());
6019+  CHECK_NULL_RETURN(output_ptr);
6020+
6021+  auto hidden_init = in_tensors_[hidden_init_index_]->data();
6022+  CHECK_NULL_RETURN(hidden_init);
6023+  auto cell_init = in_tensors_[cell_init_index_]->data();
6024+  CHECK_NULL_RETURN(cell_init);
6025+
6026+  auto output_hidden = out_tensors_[SECOND_INPUT]->data();
6027+  CHECK_NULL_RETURN(output_hidden);
6028+  (void)memcpy(output_hidden, hidden_init, in_tensors_[hidden_init_index_]->ElementsNum() * sizeof(float16_t));
6029+  auto output_cell = out_tensors_[THIRD_INPUT]->data();
6030+  CHECK_NULL_RETURN(output_cell);
6031+  (void)memcpy(output_cell, cell_init, in_tensors_[cell_init_index_]->ElementsNum() * sizeof(float16_t));
6032+
6033+  if (running_pack_) {
6034+    auto ret = PackWeightAndBias();
6035+    if (ret != lite::RET_OK) {
6036+      MS_LOG(ERROR) << "LstmFp16 PackWeightAndBias failed.";
6037+      return ret;
6038+    }
6039+  }
6040+  auto ret = MallocRunBuffer();
6041+  if (ret != RET_OK) {
6042+    MS_LOG(ERROR) << "LstmFp16CPUKernel MallocRunBuffer error.";
6043+    FreeRunBuffer();
6044+    if (running_pack_) {
6045+      FreePackBuffer();
6046+    }
6047+    return RET_ERROR;
6048+  }
6049+  LstmFp16(output_ptr, input_ptr, weight_i_ptr_, weight_h_ptr_, input_bias_, state_bias_, weight_project_ptr_,
6050+           project_bias_, reinterpret_cast<float16_t *>(output_hidden), reinterpret_cast<float16_t *>(output_cell),
6051+           running_buffer_, lstm_param_);
6052+  FreeRunBuffer();
6053+  if (running_pack_) {
6054+    FreePackBuffer();
6055+  }
6056+  return RET_OK;
6057+}
6058+
6059+int LstmFp16BaseCPUKernel::InitParam() {
6060+  auto in_shape = in_tensors_[FIRST_INPUT]->shape();
6061+  MS_CHECK_TRUE_MSG(in_shape.size() == C3NUM, lite::RET_INPUT_TENSOR_ERROR,
6062+                    "The dims of LSTM's first input must be 3.");
6063+  lstm_param_->seq_len_ = in_shape[0];
6064+  lstm_param_->batch_ = in_shape[1];
6065+  lstm_param_->input_size_ = in_shape.back();
6066+
6067+  auto h_init_shape = in_tensors_.at(hidden_init_index_)->shape();
6068+  auto c_init_shape = in_tensors_.at(cell_init_index_)->shape();
6069+  lstm_param_->hidden_size_ = c_init_shape.back();
6070+  lstm_param_->output_size_ = h_init_shape.back();
6071+
6072+  lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->output_size_
6073+                                                          : lstm_param_->batch_ * lstm_param_->output_size_;
6074+  weight_segment_num_ = lstm_param_->bidirectional_ ? C2NUM * kGateNum : kGateNum;
6075+#ifdef ENABLE_ARM64
6076+  lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, C1NUM);
6077+  lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM);
6078+
6079+  lstm_param_->state_row_align_ = UP_ROUND(lstm_param_->batch_, C1NUM);
6080+  lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM);
6081+  lstm_param_->proj_col_align_ = UP_ROUND(lstm_param_->output_size_, C4NUM);
6082+  weight_need_pack_ = true;
6083+#else
6084+  lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, C16NUM);
6085+  lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C8NUM);
6086+
6087+  lstm_param_->state_row_align_ =
6088+    lstm_param_->batch_ == 1 ? lstm_param_->batch_ : UP_ROUND(lstm_param_->batch_, C16NUM);
6089+  lstm_param_->state_col_align_ =
6090+    lstm_param_->batch_ == 1 ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, C8NUM);
6091+  lstm_param_->proj_col_align_ =
6092+    lstm_param_->batch_ == 1 ? lstm_param_->output_size_ : UP_ROUND(lstm_param_->output_size_, C8NUM);
6093+  weight_need_pack_ = lstm_param_->batch_ != 1;
6094+#endif
6095+  return RET_OK;
6096+}
6097+
6098+int LstmFp16BaseCPUKernel::PackWeightAndBias() {
6099+  FreePackBuffer();
6100+  auto ret = InitInputWeightBias();
6101+  if (ret != RET_OK) {
6102+    MS_LOG(ERROR) << "LstmFp16 InitInputWeightBias failed.";
6103+    FreePackBuffer();
6104+    return RET_ERROR;
6105+  }
6106+
6107+  ret = InitStateWeightBias();
6108+  if (ret != RET_OK) {
6109+    MS_LOG(ERROR) << "LstmFp16 InitStateWeightBias failed.";
6110+    FreePackBuffer();
6111+    return RET_ERROR;
6112+  }
6113+
6114+  ret = InitProjectWeight();
6115+  if (ret != RET_OK) {
6116+    MS_LOG(ERROR) << "LstmFp16 InitProjectWeight failed.";
6117+    FreePackBuffer();
6118+    return RET_ERROR;
6119+  }
6120+  return RET_OK;
6121+}
6122+
6123+void LstmFp16BaseCPUKernel::FreePackBuffer() {
6124+  for (auto buffer : pack_buffer_) {
6125+    if (buffer) {
6126+      free(buffer);
6127+    }
6128+  }
6129+  pack_buffer_.clear();
6130+}
6131+
6132+int LstmFp16BaseCPUKernel::MallocRunBuffer() {
6133+  for (int i = 0; i < C7NUM; i++) {
6134+    running_buffer_[i] = nullptr;
6135+  }
6136+  bool need_pack_input = true;
6137+#ifdef ENABLE_ARM64
6138+  need_pack_input = lstm_param_->seq_len_ * lstm_param_->batch_ >= C4NUM;
6139+#endif
6140+  if (need_pack_input) {
6141+    running_buffer_[kTempInputBufferIndex] = reinterpret_cast<float16_t *>(
6142+      ms_context_->allocator->Malloc(lstm_param_->input_row_align_ * lstm_param_->input_size_ * sizeof(float16_t)));
6143+    if (running_buffer_[kTempInputBufferIndex] == nullptr) {
6144+      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc input * weight left matirx error.";
6145+      return RET_ERROR;
6146+    }
6147+  }
6148+
6149+  running_buffer_[kTempInputGateBufferIndex] = reinterpret_cast<float16_t *>(ms_context_->allocator->Malloc(
6150+    kGateNum * lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t)));
6151+  if (running_buffer_[kTempInputGateBufferIndex] == nullptr) {
6152+    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error.";
6153+    return RET_ERROR;
6154+  }
6155+
6156+  need_pack_input = lstm_param_->batch_ != 1;
6157+#ifdef ENABLE_ARM64
6158+  need_pack_input = lstm_param_->batch_ >= C4NUM;
6159+#endif
6160+  if (need_pack_input) {
6161+    running_buffer_[kTempStateBufferIndex] = reinterpret_cast<float16_t *>(
6162+      ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->output_size_ * sizeof(float16_t)));
6163+    if (running_buffer_[kTempStateBufferIndex] == nullptr) {
6164+      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error.";
6165+      return RET_ERROR;
6166+    }
6167+  }
6168+
6169+  running_buffer_[kTempStateGateBufferIndex] = reinterpret_cast<float16_t *>(
6170+    ms_context_->allocator->Malloc(kGateNum * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t)));
6171+  if (running_buffer_[kTempStateGateBufferIndex] == nullptr) {
6172+    MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state gate buffer_ error.";
6173+    return RET_ERROR;
6174+  }
6175+
6176+  if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) {
6177+    int buffer_size = lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t);
6178+    running_buffer_[kTempCellStateBufferIndex] =
6179+      reinterpret_cast<float16_t *>(ms_context_->allocator->Malloc(buffer_size));
6180+    if (running_buffer_[kTempCellStateBufferIndex] == nullptr) {
6181+      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for cell error.";
6182+      return RET_ERROR;
6183+    }
6184+  }
6185+  if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) {
6186+    int buffer_size = lstm_param_->batch_ * lstm_param_->output_size_ * sizeof(float16_t);
6187+    running_buffer_[kTempHiddenStateBufferIndex] =
6188+      reinterpret_cast<float16_t *>(ms_context_->allocator->Malloc(buffer_size));
6189+    if (running_buffer_[kTempHiddenStateBufferIndex] == nullptr) {
6190+      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for hidden error.";
6191+      return RET_ERROR;
6192+    }
6193+  }
6194+
6195+  if (need_pack_input && in_tensors_.size() == C7NUM) {
6196+    running_buffer_[kTempProjectInputBufferIndex] = reinterpret_cast<float16_t *>(
6197+      ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * sizeof(float16_t)));
6198+    if (running_buffer_[kTempProjectInputBufferIndex] == nullptr) {
6199+      MS_LOG(ERROR) << "LstmFp16CPUKernel malloc project_buffer for hidden error.";
6200+      return RET_ERROR;
6201+    }
6202+  }
6203+  return RET_OK;
6204+}
6205+
6206+void LstmFp16BaseCPUKernel::FreeRunBuffer() {
6207+  ms_context_->allocator->Free(running_buffer_[kTempInputBufferIndex]);
6208+  ms_context_->allocator->Free(running_buffer_[kTempInputGateBufferIndex]);
6209+  if (lstm_param_->batch_ != 1) {
6210+    ms_context_->allocator->Free(running_buffer_[kTempStateBufferIndex]);
6211+  }
6212+  ms_context_->allocator->Free(running_buffer_[kTempStateGateBufferIndex]);
6213+  if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) {
6214+    ms_context_->allocator->Free(running_buffer_[kTempCellStateBufferIndex]);
6215+  }
6216+  if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) {
6217+    ms_context_->allocator->Free(running_buffer_[kTempHiddenStateBufferIndex]);
6218+  }
6219+}
6220+}  // namespace mindspore::kernel
6221diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.h b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.h
6222new file mode 100644
6223index 00000000..0bcb9e94
6224--- /dev/null
6225+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.h
6226@@ -0,0 +1,68 @@
6227+/**
6228+ * Copyright 2023 Huawei Technologies Co., Ltd
6229+ *
6230+ * Licensed under the Apache License, Version 2.0 (the "License");
6231+ * you may not use this file except in compliance with the License.
6232+ * You may obtain a copy of the License at
6233+ *
6234+ * http://www.apache.org/licenses/LICENSE-2.0
6235+ *
6236+ * Unless required by applicable law or agreed to in writing, software
6237+ * distributed under the License is distributed on an "AS IS" BASIS,
6238+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6239+ * See the License for the specific language governing permissions and
6240+ * limitations under the License.
6241+ */
6242+
6243+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_FP16_BASE_H_
6244+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_FP16_BASE_H_
6245+
6246+#include <vector>
6247+#include "src/litert/lite_kernel.h"
6248+#include "nnacl/lstm_parameter.h"
6249+
6250+namespace mindspore::kernel {
6251+class LstmFp16BaseCPUKernel : public LiteKernel {
6252+ public:
6253+  LstmFp16BaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
6254+                        const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
6255+      : LiteKernel(parameter, inputs, outputs, ctx) {
6256+    lstm_param_ = reinterpret_cast<LstmParameter *>(op_parameter_);
6257+  }
6258+
6259+  ~LstmFp16BaseCPUKernel() override;
6260+
6261+  int Prepare() override;
6262+  int ReSize() override;
6263+  int Run() override;
6264+
6265+ protected:
6266+  virtual int InitInputWeightBias() = 0;
6267+  virtual int InitStateWeightBias() = 0;
6268+  virtual int InitProjectWeight() = 0;
6269+
6270+  bool running_pack_{false};
6271+  bool weight_need_pack_{false};
6272+  int hidden_init_index_{0};
6273+  int cell_init_index_{0};
6274+  int weight_segment_num_{0};
6275+  float16_t *weight_i_ptr_{nullptr};
6276+  float16_t *weight_h_ptr_{nullptr};
6277+  float16_t *weight_project_ptr_{nullptr};
6278+  float16_t *input_bias_{nullptr};
6279+  float16_t *state_bias_{nullptr};
6280+  float16_t *project_bias_{nullptr};
6281+  LstmParameter *lstm_param_{nullptr};
6282+  float16_t *running_buffer_[C7NUM] = {nullptr};
6283+  std::vector<void *> pack_buffer_;
6284+
6285+ private:
6286+  int PackWeightAndBias();
6287+  int InitParam();
6288+  void FreePackBuffer();
6289+  void FreeRunBuffer();
6290+  int MallocRunBuffer();
6291+};
6292+}  // namespace mindspore::kernel
6293+
6294+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_FP16_BASE_H_
6295diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.cc
6296new file mode 100644
6297index 00000000..cf4071eb
6298--- /dev/null
6299+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.cc
6300@@ -0,0 +1,35 @@
6301+/**
6302+ * Copyright 2023 Huawei Technologies Co., Ltd
6303+ *
6304+ * Licensed under the Apache License, Version 2.0 (the "License");
6305+ * you may not use this file except in compliance with the License.
6306+ * You may obtain a copy of the License at
6307+ *
6308+ * http://www.apache.org/licenses/LICENSE-2.0
6309+ *
6310+ * Unless required by applicable law or agreed to in writing, software
6311+ * distributed under the License is distributed on an "AS IS" BASIS,
6312+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6313+ * See the License for the specific language governing permissions and
6314+ * limitations under the License.
6315+ */
6316+
6317+#include "src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h"
6318+
6319+namespace mindspore::kernel {
6320+namespace {
6321+constexpr size_t kMindirInputTensorNum = 4;
6322+}  // namespace
6323+
6324+int LstmMindirFp16CPUKernel::Prepare() {
6325+  CHECK_NOT_EQUAL_RETURN(in_tensors_.size(), kMindirInputTensorNum);
6326+  running_pack_ = trainable_ || !in_tensors_[FOURTH_INPUT]->IsConst();
6327+  return LstmFp16BaseCPUKernel::Prepare();
6328+}
6329+
6330+int LstmMindirFp16CPUKernel::InitInputWeightBias() { return lite::RET_NOT_SUPPORT; }
6331+
6332+int LstmMindirFp16CPUKernel::InitStateWeightBias() { return lite::RET_NOT_SUPPORT; }
6333+
6334+int LstmMindirFp16CPUKernel::InitProjectWeight() { return lite::RET_NOT_SUPPORT; }
6335+}  // namespace mindspore::kernel
6336diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h
6337new file mode 100644
6338index 00000000..bd8500d0
6339--- /dev/null
6340+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h
6341@@ -0,0 +1,56 @@
6342+/**
6343+ * Copyright 2023 Huawei Technologies Co., Ltd
6344+ *
6345+ * Licensed under the Apache License, Version 2.0 (the "License");
6346+ * you may not use this file except in compliance with the License.
6347+ * You may obtain a copy of the License at
6348+ *
6349+ * http://www.apache.org/licenses/LICENSE-2.0
6350+ *
6351+ * Unless required by applicable law or agreed to in writing, software
6352+ * distributed under the License is distributed on an "AS IS" BASIS,
6353+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6354+ * See the License for the specific language governing permissions and
6355+ * limitations under the License.
6356+ */
6357+
6358+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_MINDIR_FP16_H_
6359+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_MINDIR_FP16_H_
6360+
6361+#include <vector>
6362+#include "src/litert/kernel/cpu/fp16/lstm_fp16_base.h"
6363+
6364+namespace mindspore::kernel {
6365+/*
6366+ * 1. LSTM without project, output_size = hidden_size
6367+ *    h_init: second input, shape is [bidirectional, batch_size, hidden_size]
6368+ *    c_init: third input, shape is [bidirectional, batch_size, hidden_size]
6369+ *    weight_bias: fourth input, weight_ih + weight_hh + bias, the gate order is IFGO
6370+ *
6371+ * 2. LSTM with project, output_size = project_size
6372+ *    not supported yet
6373+ *    h_init: second input, shape is [bidirectional, batch_size, hidden_size]
6374+ *    c_init: third input, shape is [bidirectional, batch_size, hidden_size]
6375+ *    weight_bias: fourth input, weight_ih + weight_hh + proj + bias, the gate order is IFGO
6376+ */
6377+class LstmMindirFp16CPUKernel : public LstmFp16BaseCPUKernel {
6378+ public:
6379+  LstmMindirFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
6380+                          const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
6381+      : LstmFp16BaseCPUKernel(parameter, inputs, outputs, ctx) {
6382+    hidden_init_index_ = SECOND_INPUT;
6383+    cell_init_index_ = THIRD_INPUT;
6384+  }
6385+
6386+  ~LstmMindirFp16CPUKernel() override = default;
6387+
6388+  int Prepare() override;
6389+
6390+ protected:
6391+  int InitInputWeightBias() override;
6392+  int InitStateWeightBias() override;
6393+  int InitProjectWeight() override;
6394+};
6395+}  // namespace mindspore::kernel
6396+
6397+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_MINDIR_FP16_H_
6398diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.cc
6399new file mode 100644
6400index 00000000..473fe9b0
6401--- /dev/null
6402+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.cc
6403@@ -0,0 +1,194 @@
6404+/**
6405+ * Copyright 2023 Huawei Technologies Co., Ltd
6406+ *
6407+ * Licensed under the Apache License, Version 2.0 (the "License");
6408+ * you may not use this file except in compliance with the License.
6409+ * You may obtain a copy of the License at
6410+ *
6411+ * http://www.apache.org/licenses/LICENSE-2.0
6412+ *
6413+ * Unless required by applicable law or agreed to in writing, software
6414+ * distributed under the License is distributed on an "AS IS" BASIS,
6415+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6416+ * See the License for the specific language governing permissions and
6417+ * limitations under the License.
6418+ */
6419+
6420+#include "src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h"
6421+#include "nnacl/fp16/lstm_fp16.h"
6422+#include "nnacl/fp16/cast_fp16.h"
6423+
6424+using mindspore::lite::RET_ERROR;
6425+using mindspore::lite::RET_OK;
6426+
6427+namespace mindspore::kernel {
6428+namespace {
6429+constexpr int kGateNum = 4;
6430+constexpr size_t kInputTensorNumMin = 6;
6431+}  // namespace
6432+
6433+int LstmNonMindirFp16CPUKernel::Prepare() {
6434+  CHECK_LESS_RETURN(in_tensors_.size(), kInputTensorNumMin);
6435+  running_pack_ = train_mode_;
6436+  for (size_t i = 1; i <= FOURTH_INPUT; ++i) {
6437+    running_pack_ = running_pack_ || !in_tensors_[i]->IsConst();
6438+  }
6439+  return LstmFp16BaseCPUKernel::Prepare();
6440+}
6441+
6442+int LstmNonMindirFp16CPUKernel::InitInputWeightBias() {
6443+  // malloc and init input * weight right matrix buffer
6444+  // input -- row: seq_len * batch; col: input_size
6445+  // weight -- row: hidden_size; col: input_size, need transpose
6446+  // result -- row: seq_len * batch; col: hidden_size
6447+  auto weight_i = in_tensors_.at(1);
6448+  auto weight_i_data = weight_i->data();
6449+  CHECK_NULL_RETURN(weight_i_data);
6450+  weight_i_ptr_ = reinterpret_cast<float16_t *>(
6451+    malloc(weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float16_t)));
6452+  MS_CHECK_TRUE_MSG(weight_i_ptr_ != nullptr, lite::RET_NULL_PTR,
6453+                    "LstmNonMindirCPUKernel malloc weight_i_ptr_ failed.");
6454+  pack_buffer_.push_back(weight_i_ptr_);
6455+  if (weight_i->data_type() == kNumberTypeFloat32) {
6456+    PackLstmWeightFp32ToFp16(weight_i_ptr_, reinterpret_cast<float *>(weight_i_data), weight_segment_num_,
6457+                             lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_,
6458+                             nullptr);
6459+  } else if (weight_i->data_type() == kNumberTypeFloat16) {
6460+    PackLstmWeightFp16(weight_i_ptr_, reinterpret_cast<float16_t *>(weight_i_data), weight_segment_num_,
6461+                       lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, nullptr);
6462+  } else {
6463+    MS_LOG(ERROR) << "Unsupported data type of weight_i tensor for lstm.";
6464+    return RET_ERROR;
6465+  }
6466+
6467+  // input bias
6468+  auto bias = in_tensors_.at(FOURTH_INPUT);
6469+  auto bias_data = bias->data();
6470+  CHECK_NULL_RETURN(bias_data);
6471+  input_bias_ =
6472+    reinterpret_cast<float16_t *>(malloc(weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float16_t)));
6473+  MS_CHECK_TRUE_MSG(input_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc input_bias_ failed.");
6474+  pack_buffer_.push_back(input_bias_);
6475+  (void)memset(input_bias_, 0, weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float16_t));
6476+  if (bias->data_type() == kNumberTypeFloat32) {
6477+    PackLstmBiasFp32ToFp16(input_bias_, reinterpret_cast<float *>(bias_data), weight_segment_num_,
6478+                           lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_,
6479+                           nullptr);
6480+  } else if (bias->data_type() == kNumberTypeFloat16) {
6481+    PackLstmBiasFp16(input_bias_, reinterpret_cast<float16_t *>(bias_data), weight_segment_num_,
6482+                     lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_, nullptr);
6483+  } else {
6484+    MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm.";
6485+    return RET_ERROR;
6486+  }
6487+  return RET_OK;
6488+}
6489+
6490+int LstmNonMindirFp16CPUKernel::InitStateWeightBias() {
6491+  // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times.
6492+  // state -- row: batch; col: hidden_size
6493+  // weight -- row: hidden_size; col: hidden_size, need transpose
6494+  // result -- row: batch; col: hidden_size
6495+  auto weight_h = in_tensors_.at(THIRD_INPUT);
6496+  auto weight_h_data = weight_h->data();
6497+  CHECK_NULL_RETURN(weight_h_data);
6498+  weight_h_ptr_ = reinterpret_cast<float16_t *>(
6499+    malloc(weight_segment_num_ * lstm_param_->state_col_align_ * lstm_param_->output_size_ * sizeof(float16_t)));
6500+  MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR,
6501+                    "LstmNonMindirCPUKernel malloc weight_h_ptr_ failed.");
6502+
6503+  if (weight_need_pack_) {
6504+    if (weight_h->data_type() == kNumberTypeFloat32) {
6505+      PackLstmWeightFp32ToFp16(weight_h_ptr_, reinterpret_cast<float *>(weight_h_data), weight_segment_num_,
6506+                               lstm_param_->output_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_,
6507+                               nullptr);
6508+    } else if (weight_h->data_type() == kNumberTypeFloat16) {
6509+      PackLstmWeightFp16(weight_h_ptr_, reinterpret_cast<float16_t *>(weight_h_data), weight_segment_num_,
6510+                         lstm_param_->output_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_, nullptr);
6511+    } else {
6512+      MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm.";
6513+      return RET_ERROR;
6514+    }
6515+  } else {
6516+    if (weight_h->data_type() == kNumberTypeFloat32) {
6517+      Float32ToFloat16(reinterpret_cast<float *>(weight_h_data), weight_h_ptr_, weight_h->ElementsNum());
6518+    } else if (weight_h->data_type() == kNumberTypeFloat16) {
6519+      (void)memcpy(weight_h_ptr_, reinterpret_cast<float16_t *>(weight_h_data), weight_h->Size());
6520+    } else {
6521+      MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm.";
6522+      return RET_ERROR;
6523+    }
6524+  }
6525+
6526+  // state bias
6527+  auto bias = in_tensors_[FOURTH_INPUT];
6528+  auto bias_data = bias->data();
6529+  CHECK_NULL_RETURN(bias_data);
6530+  state_bias_ =
6531+    reinterpret_cast<float16_t *>(malloc(weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float16_t)));
6532+  MS_CHECK_TRUE_MSG(state_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc state_bias_ failed.");
6533+  (void)memset(state_bias_, 0, weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float16_t));
6534+  if (bias->data_type() == kNumberTypeFloat32) {
6535+    auto state_bias_data = reinterpret_cast<float *>(bias_data) + kGateNum * lstm_param_->hidden_size_;
6536+    PackLstmBiasFp32ToFp16(state_bias_, state_bias_data, weight_segment_num_, lstm_param_->hidden_size_,
6537+                           lstm_param_->state_col_align_, lstm_param_->bidirectional_, nullptr);
6538+  } else if (bias->data_type() == kNumberTypeFloat16) {
6539+    auto state_bias_data = reinterpret_cast<float16_t *>(bias_data) + kGateNum * lstm_param_->hidden_size_;
6540+    PackLstmBiasFp16(state_bias_, state_bias_data, weight_segment_num_, lstm_param_->hidden_size_,
6541+                     lstm_param_->state_col_align_, lstm_param_->bidirectional_, nullptr);
6542+  } else {
6543+    MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm.";
6544+    return RET_ERROR;
6545+  }
6546+  return RET_OK;
6547+}
6548+
6549+int LstmNonMindirFp16CPUKernel::InitProjectWeight() {
6550+  if (in_tensors_.size() < C7NUM) {
6551+    return RET_OK;
6552+  }
6553+  auto weight_pro = in_tensors_[SEVENTH_INPUT];
6554+  auto shape = weight_pro->shape();
6555+  MS_CHECK_TRUE_MSG(shape.size() == C3NUM, lite::RET_ERROR, "Project-weight's shape must be 3D.");
6556+  auto weight_pro_data = weight_pro->data();
6557+  CHECK_NULL_RETURN(weight_pro_data);
6558+  int batch = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
6559+  if (shape[0] != batch) {
6560+    MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true).";
6561+    return RET_ERROR;
6562+  }
6563+  int pro_col_align = lstm_param_->proj_col_align_;
6564+  weight_project_ptr_ =
6565+    reinterpret_cast<float16_t *>(malloc(batch * lstm_param_->hidden_size_ * pro_col_align * sizeof(float16_t)));
6566+  MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR,
6567+                    "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed.");
6568+
6569+  if (weight_need_pack_) {
6570+    if (weight_pro->data_type() == kNumberTypeFloat32) {
6571+      PackLstmWeightFp32ToFp16(weight_project_ptr_, reinterpret_cast<float *>(weight_pro_data), batch,
6572+                               lstm_param_->hidden_size_, lstm_param_->output_size_, pro_col_align, nullptr);
6573+    } else if (weight_pro->data_type() == kNumberTypeFloat16) {
6574+      PackLstmWeightFp16(weight_project_ptr_, reinterpret_cast<float16_t *>(weight_pro_data), batch,
6575+                         lstm_param_->hidden_size_, lstm_param_->output_size_, pro_col_align, nullptr);
6576+    } else {
6577+      MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm.";
6578+      return RET_ERROR;
6579+    }
6580+  } else {
6581+    if (weight_pro->data_type() == kNumberTypeFloat32) {
6582+      Float32ToFloat16(reinterpret_cast<float *>(weight_pro_data), weight_project_ptr_, weight_pro->ElementsNum());
6583+    } else if (weight_pro->data_type() == kNumberTypeFloat16) {
6584+      (void)memcpy(weight_project_ptr_, weight_pro_data, weight_pro->Size());
6585+    } else {
6586+      MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm.";
6587+      return RET_ERROR;
6588+    }
6589+  }
6590+  size_t bias_size = UP_ROUND(lstm_param_->output_size_, C8NUM) * sizeof(float16_t);
6591+  project_bias_ = reinterpret_cast<float16_t *>(malloc(bias_size));
6592+  MS_CHECK_TRUE_MSG(project_bias_ != nullptr, lite::RET_NULL_PTR,
6593+                    "LstmNonMindirCPUKernel malloc project_bias_ failed.");
6594+  (void)memset(project_bias_, 0, bias_size);
6595+  return RET_OK;
6596+}
6597+}  // namespace mindspore::kernel
6598diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h
6599new file mode 100644
6600index 00000000..132ef1cf
6601--- /dev/null
6602+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h
6603@@ -0,0 +1,59 @@
6604+/**
6605+ * Copyright 2023 Huawei Technologies Co., Ltd
6606+ *
6607+ * Licensed under the Apache License, Version 2.0 (the "License");
6608+ * you may not use this file except in compliance with the License.
6609+ * You may obtain a copy of the License at
6610+ *
6611+ * http://www.apache.org/licenses/LICENSE-2.0
6612+ *
6613+ * Unless required by applicable law or agreed to in writing, software
6614+ * distributed under the License is distributed on an "AS IS" BASIS,
6615+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6616+ * See the License for the specific language governing permissions and
6617+ * limitations under the License.
6618+ */
6619+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_NON_MINDIR_FP16_H_
6620+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_NON_MINDIR_FP16_H_
6621+
6622+#include <vector>
6623+#include "src/litert/kernel/cpu/fp16/lstm_fp16_base.h"
6624+
6625+namespace mindspore::kernel {
6626+/*
6627+ * 1. LSTM without project, output_size = hidden_size
6628+ *    weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size]
6629+ *    weight_hh: third input, shape is [bidirectional, 4 * hidden_size, hidden_size]
6630+ *    bias: fourth input, shape is [bidirectional, 8 * hidden_size]
6631+ *    h_init: fifth input, shape is [bidirectional, batch_size, hidden_size]
6632+ *    c_init: sixth input, shape is [bidirectional, batch_size, hidden_size]
6633+ *
6634+ * 2. LSTM with project, output_size = project_size
6635+ *    weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size]
6636+ *    weight_hh: third input, shape is [bidirectional, 4 * hidden_size, project_size]
6637+ *    bias: fourth input, shape is [bidirectional, 8 * hidden_size]
6638+ *    h_init: fifth input, shape is [bidirectional, batch_size, project_size]
6639+ *    c_init: sixth input, shape is [bidirectional, batch_size, hidden_size]
6640+ *    weight_pro: seventh input, shape is [bidirectional, project_size, hidden_size]
6641+ */
6642+class LstmNonMindirFp16CPUKernel : public LstmFp16BaseCPUKernel {
6643+ public:
6644+  LstmNonMindirFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
6645+                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
6646+      : LstmFp16BaseCPUKernel(parameter, inputs, outputs, ctx) {
6647+    hidden_init_index_ = FIFTH_INPUT;
6648+    cell_init_index_ = SIXTH_INPUT;
6649+  }
6650+
6651+  ~LstmNonMindirFp16CPUKernel() override = default;
6652+
6653+  int Prepare() override;
6654+
6655+ protected:
6656+  int InitInputWeightBias() override;
6657+  int InitStateWeightBias() override;
6658+  int InitProjectWeight() override;
6659+};
6660+}  // namespace mindspore::kernel
6661+
6662+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_NON_MINDIR_FP16_H_
6663diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc
6664index 8adb97b9..d6f94fd9 100644
6665--- a/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc
6666+++ b/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc
6667@@ -187,13 +187,13 @@ void MatmulBaseFP16CPUKernel::InitMatrixA(const void *src_ptr) {
6668     float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
6669     if (params_->a_transpose_) {
6670 #ifdef ENABLE_ARM64
6671-      RowMajor2RowNMajorFp16((const float16_t *)src, dst, params_->deep_, params_->row_);
6672+      RowMajor2RowNMajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
6673 #else
6674       RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
6675 #endif
6676     } else {
6677 #ifdef ENABLE_ARM64
6678-      RowMajor2ColNMajorFp16((const float16_t *)src, dst, params_->row_, params_->deep_);
6679+      RowMajor2ColNMajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
6680 #else
6681       RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
6682 #endif
6683diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc
6684index 0b67f2c2..67f42265 100644
6685--- a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc
6686+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc
6687@@ -1,5 +1,5 @@
6688 /**
6689- * Copyright 2020 Huawei Technologies Co., Ltd
6690+ * Copyright 2020-2023 Huawei Technologies Co., Ltd
6691  *
6692  * Licensed under the Apache License, Version 2.0 (the "License");
6693  * you may not use this file except in compliance with the License.
6694@@ -14,14 +14,11 @@
6695  * limitations under the License.
6696  */
6697
6698-#include "src/litert/kernel/cpu/fp32/lstm_fp32.h"
6699-#include <cfloat>
6700 #include <vector>
6701-#include "schema/model_generated.h"
6702+#include "src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h"
6703+#include "src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h"
6704 #include "src/litert/kernel_registry.h"
6705 #include "include/errorcode.h"
6706-#include "nnacl/fp32/pack_fp32.h"
6707-#include "nnacl/fp32/matmul_fp32.h"
6708
6709 using mindspore::kernel::KERNEL_ARCH;
6710 using mindspore::lite::KernelRegistrar;
6711@@ -32,664 +29,31 @@ using mindspore::schema::PrimitiveType_LSTM;
6712
6713 namespace mindspore::kernel {
6714 namespace {
6715-constexpr int kOutputHiddenStatusIndex = 1;
6716-constexpr int kOutputCellStatusIndex = 2;
6717-}  // namespace
6718-
6719-int LstmInputMulWeightRun(void *cdata, int task_id, float, float) {
6720-  auto kernel = reinterpret_cast<const LstmCPUKernel *>(cdata);
6721-  CHECK_NULL_RETURN(kernel);
6722-  kernel->InputWeightMatMul(task_id);
6723-  return RET_OK;
6724-}
6725-
6726-int LstmSequenceLoopRun(void *cdata, int task_id, float, float) {
6727-  auto kernel = reinterpret_cast<LstmCPUKernel *>(cdata);
6728-  CHECK_NULL_RETURN(kernel);
6729-  auto ret = kernel->DoSequenceLoop(task_id);
6730-  if (ret != RET_OK) {
6731-    MS_LOG(ERROR) << "LSTM: Do Sequence-loop failed.";
6732-  }
6733-  return ret;
6734-}
6735-
6736-void LstmCPUKernel::FreeRunBuffer() {
6737-  for (auto data : buffer_running_malloc_) {
6738-    ms_context_->allocator->Free(data);
6739-  }
6740-  buffer_running_malloc_.clear();
6741-}
6742-
6743-int LstmCPUKernel::InitInputWeightBias() {
6744-  // malloc and init input * weight right matrix buffer
6745-  // input -- row: seq_len * batch; col: input_size
6746-  // weight -- row: hidden_size; col: input_size, need transpose
6747-  // result -- row: seq_len * batch; col: hidden_size
6748-  weight_i_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
6749-    weight_batch_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float)));
6750-  if (weight_i_ptr_ == nullptr) {
6751-    MS_LOG(ERROR) << "LstmCPUKernel malloc weight_i_ptr_ error.";
6752-    return RET_ERROR;
6753-  }
6754-  buffer_running_malloc_.push_back(weight_i_ptr_);
6755-  int i_index = (in_tensors_.size() == mindir_input_tensors) ? combined_weights_index : onnx_weight_i_index;
6756-  const int *weights_order = (in_tensors_.size() == mindir_input_tensors) ? weights_order_IFOG : nullptr;
6757-  auto weight_i = in_tensors_.at(i_index);
6758-  auto weight_i_data = reinterpret_cast<float *>(weight_i->data());
6759-
6760-  CHECK_NULL_RETURN(weight_i_data);
6761-  int cw_size = (lstm_param_->input_size_ * lstm_param_->hidden_size_);
6762-  int hh_size = (lstm_param_->hidden_size_ * lstm_param_->hidden_size_);
6763-  int b_size = (lstm_param_->hidden_size_);
6764-  bool has_bias = (weight_batch_ * (cw_size + hh_size) < weight_i->ElementsNum()) ? true : false;
6765-  int stride = (gpu_orig_state_) ? gate_num * (cw_size + hh_size) : gate_num * (cw_size);
6766-  PackLstmWeightWithStride(weight_i_ptr_, weight_i_data, weight_batch_, lstm_param_->input_size_,
6767-                           lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_,
6768-                           stride, weights_order);
6769-  // input bias
6770-  input_bias_ = reinterpret_cast<float *>(
6771-    ms_context_->allocator->Malloc(weight_batch_ * lstm_param_->input_col_align_ * sizeof(float)));
6772-  if (input_bias_ == nullptr) {
6773-    MS_LOG(ERROR) << "LstmCPUKernel malloc input_bias_ error.";
6774-    return RET_ERROR;
6775-  }
6776-  memset(input_bias_, 0, weight_batch_ * lstm_param_->input_col_align_ * sizeof(float));
6777-  buffer_running_malloc_.push_back(input_bias_);
6778-
6779-  int offset = weight_batch_ * (cw_size + hh_size);
6780-  float *bias_data = (has_bias) ? weight_i_data + offset : nullptr;
6781-  int dir_mul = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
6782-  int b_stride = (gpu_orig_state_) ? gate_num * (dir_mul * b_size) : gate_num * (b_size);
6783-  if (in_tensors_.size() > mindir_input_tensors) {
6784-    bias_data = reinterpret_cast<float *>(in_tensors_.at(onnx_bias_index)->data());
6785-    CHECK_NULL_RETURN(bias_data);
6786-    PackLstmBias(input_bias_, bias_data, weight_batch_, lstm_param_->hidden_size_, lstm_param_->input_col_align_,
6787-                 lstm_param_->bidirectional_, weights_order);
6788-  } else {
6789-    if (bias_data != nullptr) {
6790-      PackLstmBiasWithStride(input_bias_, bias_data, weight_batch_, lstm_param_->hidden_size_,
6791-                             lstm_param_->input_col_align_, lstm_param_->bidirectional_, b_stride, weights_order);
6792-    }
6793-  }
6794-  return RET_OK;
6795-}
6796-
6797-int LstmCPUKernel::InitStateWeightBias() {
6798-  // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times.
6799-  // state -- row: batch; col: hidden_size
6800-  // weight -- row: hidden_size; col: hidden_size, need transpose
6801-  // result -- row: batch; col: hidden_size
6802-  int weight_i_size = weight_batch_ * lstm_param_->hidden_size_ * lstm_param_->input_size_;
6803-  int h_index = (in_tensors_.size() == mindir_input_tensors) ? combined_weights_index : onnx_weight_h_index;
6804-  auto weight_h = in_tensors_.at(h_index);
6805-  auto weight_h_data = (reinterpret_cast<float *>(weight_h->data()));
6806-
6807-  int cw_size = (lstm_param_->input_size_ * lstm_param_->hidden_size_);
6808-  int hh_size = (lstm_param_->hidden_size_ * lstm_param_->project_size_);
6809-  int b_size = (lstm_param_->hidden_size_);
6810-  int stride = (gpu_orig_state_) ? gate_num * (cw_size + hh_size) : gate_num * (hh_size);
6811-
6812-  if (in_tensors_.size() == mindir_input_tensors) {
6813-    if (gpu_orig_state_) {
6814-      weight_h_data += gate_num * cw_size;
6815-    } else {
6816-      weight_h_data += weight_i_size;
6817-    }
6818-  }
6819-  CHECK_NULL_RETURN(weight_h_data);
6820-  if (!state_is_vec_) {
6821-    weight_h_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
6822-      weight_batch_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * sizeof(float)));
6823-    if (weight_h_ptr_ == nullptr) {
6824-      MS_LOG(ERROR) << "LstmCPUKernel malloc weight_h_ptr_ error.";
6825-      return RET_ERROR;
6826-    }
6827-    buffer_running_malloc_.push_back(weight_h_ptr_);
6828-    const int *weights_order = (in_tensors_.size() == mindir_input_tensors) ? weights_order_IFOG : nullptr;
6829-    PackLstmWeightWithStride(weight_h_ptr_, weight_h_data, weight_batch_, lstm_param_->project_size_,
6830-                             lstm_param_->hidden_size_, lstm_param_->state_col_align_, lstm_param_->bidirectional_,
6831-                             stride, weights_order);
6832-  } else {
6833-#ifdef ENABLE_AVX
6834-    weight_h_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
6835-      weight_batch_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * sizeof(float)));
6836-    if (weight_h_ptr_ == nullptr) {
6837-      MS_LOG(ERROR) << "LstmCPUKernel malloc weight_h_ptr_ error.";
6838-      return RET_ERROR;
6839-    }
6840-    buffer_running_malloc_.push_back(weight_h_ptr_);
6841-    for (int i = 0; i < weight_batch_; i++) {
6842-      const float *src_batch = weight_h_data + i * lstm_param_->hidden_size_ * lstm_param_->project_size_;
6843-      float *dst_batch = weight_h_ptr_ + i * lstm_param_->state_col_align_ * lstm_param_->project_size_;
6844-      RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->hidden_size_, lstm_param_->project_size_);
6845-    }
6846-#else
6847-    weight_h_ptr_ = weight_h_data;
6848-#endif
6849-  }
6850-
6851-  // state bias
6852-  int weight_h_size = weight_batch_ * lstm_param_->hidden_size_ * lstm_param_->hidden_size_;
6853-  int bias_size = weight_batch_ * lstm_param_->hidden_size_;
6854-  state_bias_ = reinterpret_cast<float *>(
6855-    ms_context_->allocator->Malloc(weight_batch_ * lstm_param_->state_col_align_ * sizeof(float)));
6856-  if (state_bias_ == nullptr) {
6857-    MS_LOG(ERROR) << "LstmCPUKernel malloc state_bias_ error.";
6858-    return RET_ERROR;
6859-  }
6860-  memset(state_bias_, 0, weight_batch_ * lstm_param_->state_col_align_ * sizeof(float));
6861-  buffer_running_malloc_.push_back(state_bias_);
6862-  // if ONNX, secend bias is also present order IOFG
6863-  if (in_tensors_.size() > mindir_input_tensors) {
6864-    float *state_bias =
6865-      reinterpret_cast<float *>(in_tensors_.at(onnx_bias_index)->data()) + gate_num * lstm_param_->hidden_size_;
6866-    CHECK_NULL_RETURN(state_bias);
6867-    PackLstmBias(state_bias_, state_bias, weight_batch_, lstm_param_->hidden_size_, lstm_param_->state_col_align_,
6868-                 lstm_param_->bidirectional_, nullptr);
6869-  } else if (weight_h->ElementsNum() - weight_i_size - weight_h_size - C2NUM * bias_size == 0) {
6870-    // mindir from device "GPU", secend bias is also present order IFOG
6871-    int dir_mul = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
6872-    int bias_offset = (gpu_orig_state_) ? gate_num * ((dir_mul - C1NUM) * cw_size + dir_mul * hh_size + b_size)
6873-                                        : weight_h_size + bias_size;
6874-    float *state_bias = weight_h_data + bias_offset;
6875-    int b_stride = (gpu_orig_state_) ? gate_num * (b_size * C2NUM) : gate_num * b_size;
6876-    PackLstmBiasWithStride(state_bias_, state_bias, weight_batch_, lstm_param_->hidden_size_,
6877-                           lstm_param_->state_col_align_, lstm_param_->bidirectional_, b_stride, weights_order_IFOG);
6878-  }
6879-  return RET_OK;
6880-}
6881-
6882-int LstmCPUKernel::InitProjectWeight() {
6883-  if (in_tensors_.size() < C7NUM) {
6884-    return RET_OK;
6885-  }
6886-  auto weight_pro = in_tensors_.at(SEVENTH_INPUT);
6887-  auto shape = weight_pro->shape();
6888-  if (shape.size() != C3NUM) {
6889-    MS_LOG(ERROR) << "Project-weight's shape must be 3D.";
6890-    return RET_ERROR;
6891-  }
6892-  auto weight_pro_data = reinterpret_cast<float *>(weight_pro->data());
6893-  CHECK_NULL_RETURN(weight_pro_data);
6894-  int batch = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
6895-  if (shape[0] != batch) {
6896-    MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true).";
6897-    return RET_ERROR;
6898-  }
6899-  int col_align = UP_ROUND(lstm_param_->project_size_, col_tile_);
6900-  if (!state_is_vec_) {
6901-    weight_project_ptr_ = reinterpret_cast<float *>(
6902-      ms_context_->allocator->Malloc(batch * lstm_param_->hidden_size_ * col_align * sizeof(float)));
6903-    if (weight_project_ptr_ == nullptr) {
6904-      MS_LOG(ERROR) << "LstmCPUKernel malloc weight_project_ptr_ error.";
6905-      return RET_ERROR;
6906-    }
6907-    buffer_running_malloc_.push_back(weight_project_ptr_);
6908-    PackLstmWeightWithStride(weight_project_ptr_, weight_pro_data, batch, lstm_param_->hidden_size_,
6909-                             lstm_param_->project_size_, col_align, lstm_param_->bidirectional_,
6910-                             lstm_param_->hidden_size_ * lstm_param_->project_size_, nullptr);
6911-  } else {
6912-#ifdef ENABLE_AVX
6913-    weight_project_ptr_ = reinterpret_cast<float *>(
6914-      ms_context_->allocator->Malloc(batch * lstm_param_->hidden_size_ * col_align * sizeof(float)));
6915-    if (weight_project_ptr_ == nullptr) {
6916-      MS_LOG(ERROR) << "LstmCPUKernel malloc weight_project_ptr_ error.";
6917-      return RET_ERROR;
6918-    }
6919-    buffer_running_malloc_.push_back(weight_project_ptr_);
6920-    for (int i = 0; i < batch; ++i) {
6921-      const float *src_batch = weight_pro_data + i * lstm_param_->hidden_size_ * lstm_param_->project_size_;
6922-      float *dst_batch = weight_project_ptr_ + i * lstm_param_->hidden_size_ * col_align;
6923-      RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->project_size_, lstm_param_->hidden_size_);
6924-    }
6925-#else
6926-    weight_project_ptr_ = weight_pro_data;
6927-#endif
6928-  }
6929-  return RET_OK;
6930-}
6931-
6932-int LstmCPUKernel::InitParam() {
6933-  auto input = in_tensors_.front();
6934-  std::vector<int> in_shape = input->shape();
6935-  lstm_param_->seq_len_ = in_shape.at(FIRST_INPUT);
6936-  lstm_param_->batch_ = in_shape.at(SECOND_INPUT);
6937-  lstm_param_->input_size_ = in_shape.at(THIRD_INPUT);
6938-
6939-  auto weight_i = in_tensors_.at(onnx_weight_i_index);
6940-  std::vector<int> w_shape = weight_i->shape();
6941-  if (in_tensors_.size() == mindir_input_tensors) {
6942-    hidden_state_input_index_ = mindir_hidden_state_input_index;
6943-    cell_state_input_index_ = mindir_cell_state_input_index;
6944-    lstm_param_->hidden_size_ = w_shape.at(THIRD_INPUT);
6945-    lstm_param_->project_size_ = lstm_param_->hidden_size_;
6946-  } else {
6947-    lstm_param_->hidden_size_ = w_shape.at(SECOND_INPUT) / gate_num;
6948-    auto weight_h = in_tensors_[THIRD_INPUT];
6949-    auto h_shape = weight_h->shape();
6950-    lstm_param_->project_size_ = h_shape.back();
6951-  }
6952-
6953-  lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->hidden_size_
6954-                                                          : lstm_param_->batch_ * lstm_param_->hidden_size_;
6955-  weight_batch_ = lstm_param_->bidirectional_ ? C2NUM * gate_num : gate_num;
6956-  state_is_vec_ = lstm_param_->batch_ == 1;
6957-  // determine FB origin
6958-  gpu_orig_state_ = false;
6959-  if (in_tensors_.size() == mindir_input_tensors) {
6960-    gpu_orig_state_ = gpu_orig_cfg_;
6961-    auto weight_t = in_tensors_.at(combined_weights_index);
6962-    int cw_size = (lstm_param_->input_size_ * lstm_param_->hidden_size_);
6963-    int hh_size = (lstm_param_->hidden_size_ * lstm_param_->hidden_size_);
6964-    int b_size = (lstm_param_->hidden_size_);
6965-    bool has_bias = (weight_batch_ * (cw_size + hh_size) < weight_t->ElementsNum()) ? true : false;
6966-    // if bias exist we can determine the gpu_orig_state_
6967-    if (has_bias) {
6968-      gpu_orig_state_ =
6969-        (weight_batch_ * (cw_size + hh_size + C2NUM * b_size) == weight_t->ElementsNum()) ? true : false;
6970-    }
6971-  }
6972-
6973-#ifdef ENABLE_AVX
6974-  row_tile_ = C6NUM;
6975-  col_tile_ = C16NUM;
6976-#elif defined(ENABLE_ARM32)
6977-  row_tile_ = C12NUM;
6978-  col_tile_ = C4NUM;
6979-#elif defined(ENABLE_SSE)
6980-  row_tile_ = C4NUM;
6981-  col_tile_ = C8NUM;
6982-#else
6983-  row_tile_ = C12NUM;
6984-  col_tile_ = C8NUM;
6985-#endif
6986-  lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, row_tile_);
6987-  lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, col_tile_);
6988-  input_thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(lstm_param_->input_col_align_, col_tile_));
6989-  MS_CHECK_FALSE(input_thread_count_ == 0, RET_ERROR);
6990-  input_thread_stride_ = UP_DIV(UP_DIV(lstm_param_->input_col_align_, col_tile_), input_thread_count_);
6991-
6992-  state_row_tile_ = row_tile_;
6993-  state_col_tile_ = col_tile_;
6994-#ifdef ENABLE_AVX
6995-  if (state_is_vec_) {
6996-    state_row_tile_ = 1;
6997-    state_col_tile_ = C8NUM;
6998-  }
6999-#endif
7000-
7001-  lstm_param_->state_row_align_ = state_is_vec_ ? 1 : UP_ROUND(lstm_param_->batch_, state_row_tile_);
7002-#ifdef ENABLE_AVX
7003-  lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, state_col_tile_);
7004-#else
7005-  lstm_param_->state_col_align_ =
7006-    state_is_vec_ ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, state_col_tile_);
7007-#endif
7008-  return RET_OK;
7009+constexpr size_t kMindirInputTensorNum = 4;
7010 }
7011-
7012-int LstmCPUKernel::Prepare() {
7013-  CHECK_LESS_RETURN(in_tensors_.size(), mindir_input_tensors);
7014-  for (size_t i = 0; i < in_tensors_.size(); i++) {
7015-    CHECK_NULL_RETURN(in_tensors_.at(i));
7016-  }
7017-  CHECK_LESS_RETURN(out_tensors_.size(), DIMENSION_3D);
7018-  for (size_t i = 0; i < out_tensors_.size(); i++) {
7019-    CHECK_NULL_RETURN(out_tensors_.at(i));
7020-  }
7021-  CHECK_NULL_RETURN(lstm_param_);
7022-  if (!InferShapeDone()) {
7023-    return RET_OK;
7024-  }
7025-  return ReSize();
7026-}
7027-
7028-int LstmCPUKernel::ReSize() {
7029-  auto ret = InitParam();
7030-  if (ret != RET_OK) {
7031-    MS_LOG(ERROR) << "LstmCPUKernel InitParam error.";
7032-    return RET_ERROR;
7033+LiteKernel *LstmFp32KernelCreator(const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs,
7034+                                  OpParameter *parameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
7035+  if (parameter == nullptr) {
7036+    MS_LOG(ERROR) << "parameter is nullptr.";
7037+    return nullptr;
7038   }
7039-
7040-  return RET_OK;
7041-}
7042-
7043-int LstmCPUKernel::MallocRunBuffer(bool is_double) {
7044-  bool need_zone = lstm_param_->zoneout_cell_ < -FLT_EPSILON || lstm_param_->zoneout_cell_ > FLT_EPSILON;
7045-  size_t whole_size = 0;
7046-  std::vector<size_t> segments;
7047-  int scale = is_double ? C2NUM : 1;
7048-  size_t segment = gate_num * lstm_param_->seq_len_ * lstm_param_->batch_ *
7049-                   lstm_param_->hidden_size_;  // 0: input * weight for result matrix
7050-  segments.push_back(segment);
7051-  whole_size += segment * scale;
7052-
7053-  segment = state_is_vec_
7054-              ? 0
7055-              : lstm_param_->state_row_align_ * lstm_param_->project_size_;  // 1: state * weight for left matirx
7056-  segments.push_back(segment);
7057-  whole_size += segment * scale;
7058-
7059-  segment = gate_num * lstm_param_->batch_ * lstm_param_->hidden_size_;  // 2: state gate buffer
7060-  segments.push_back(segment);
7061-  whole_size += segment * scale;
7062-
7063-  segment = need_zone ? lstm_param_->batch_ * lstm_param_->hidden_size_ : 0;  // 3: state_buffer for cell
7064-  segments.push_back(segment);
7065-  whole_size += segment * scale;
7066-
7067-  segment = need_zone ? lstm_param_->batch_ * lstm_param_->project_size_ : 0;  // 4: state_buffer for hidden
7068-  segments.push_back(segment);
7069-  whole_size += segment * scale;
7070-
7071-  segment = 0;
7072-#ifdef ENABLE_AVX
7073-  bool output_need_packed = lstm_param_->hidden_size_ % state_col_tile_;
7074-  if (state_is_vec_ && output_need_packed) {  // vec matmul need to malloc dst
7075-    int out_channel = lstm_param_->hidden_size_;
7076-    int oc_block_num = UP_DIV(out_channel, state_col_tile_);
7077-    MS_ASSERT(ms_context_->allocator != nullptr);
7078-    segment = lstm_param_->batch_ * oc_block_num * state_col_tile_;  // 5: tmp output data
7079+  if (desc.data_type == kTypeUnknown) {
7080+    MS_LOG(WARNING) << "desc data_type is unknown.";
7081   }
7082-#endif
7083-  segments.push_back(segment);
7084-  whole_size += segment * scale;
7085-
7086-  if (in_tensors_.size() == C7NUM) {
7087-    segment = state_is_vec_ ? 0 : lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * scale;
7088-    segments.push_back(segment);  // 6: project-layer input
7089-    whole_size += segment;
7090-    segment = 0;
7091-#ifdef ENABLE_AVX
7092-    segment =
7093-      output_need_packed ? lstm_param_->batch_ * UP_ROUND(lstm_param_->project_size_, state_col_tile_) * scale : 0;
7094-#endif
7095-    segments.push_back(segment);  // 7: project-layer output
7096-    whole_size += segment;
7097+  LiteKernel *kernel{nullptr};
7098+  if (inputs.size() == kMindirInputTensorNum) {
7099+    kernel = new (std::nothrow)
7100+      LstmMindirFp32CPUKernel(parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
7101   } else {
7102-    (void)segments.insert(segments.end(), C2NUM, 0);
7103-  }
7104-
7105-  segment = 0;
7106-  if (!(in_tensors_.size() > mindir_input_tensors)) {
7107-    segment = lstm_param_->batch_ * lstm_param_->hidden_size_;
7108-  }
7109-  segments.push_back(segment);
7110-  whole_size += segment * scale;
7111-
7112-  segment =
7113-    lstm_param_->input_row_align_ * lstm_param_->input_size_;  // input * weight for left matrix, which only once
7114-  whole_size += segment;
7115-
7116-  auto whole_memory = reinterpret_cast<float *>(ms_context_->allocator->Malloc(whole_size * sizeof(float)));
7117-  MS_CHECK_TRUE_MSG(whole_memory != nullptr, RET_ERROR, "LSTM: malloc failed.");
7118-  buffer_running_malloc_.push_back(whole_memory);
7119-  MS_ASSERT(segments.size() == C9NUM);
7120-  auto Allocate = [&whole_memory, &segments](float **buffer) mutable {
7121-    for (int i = 0; i < C9NUM; ++i) {
7122-      buffer[i] = nullptr;
7123-      if (segments[i] == 0) {
7124-        continue;
7125-      }
7126-      buffer[i] = whole_memory;
7127-      whole_memory += segments[i];
7128-    }
7129-  };
7130-  Allocate(buffer_forward_);
7131-  if (is_double) {
7132-    Allocate(buffer_backward_);
7133-  }
7134-  packed_input_ = whole_memory;
7135-  return RET_OK;
7136-}
7137-
7138-void LstmCPUKernel::InputWeightMatMul(int task_id) const {
7139-  int current_start_oc = task_id * input_thread_stride_ * col_tile_;
7140-  int current_rest_oc = 0;
7141-  current_rest_oc = lstm_param_->hidden_size_ - current_start_oc;
7142-  int cur_oc = MSMIN(input_thread_stride_ * col_tile_, current_rest_oc);
7143-  if (cur_oc <= 0) {
7144-    return;
7145-  }
7146-
7147-  auto b = weight_loop_ + current_start_oc * lstm_param_->input_size_;
7148-  auto c = gate_loop_ + current_start_oc;
7149-  auto bias = (bias_loop_ == nullptr) ? nullptr : bias_loop_ + current_start_oc;
7150-  MatMulOpt(packed_input_, b, c, bias, ActType_No, lstm_param_->input_size_,
7151-            lstm_param_->seq_len_ * lstm_param_->batch_, cur_oc, lstm_param_->hidden_size_, OutType_Nhwc);
7152-}
7153-
7154-int LstmCPUKernel::DoSequenceLoop(int task_id) {
7155-  if (task_id == 0) {
7156-    LstmForwardLoop(buffer_forward_);
7157-    return RET_OK;
7158-  }
7159-  if (task_id == 1) {
7160-    LstmBackwardLoop(buffer_backward_);
7161-    return RET_OK;
7162-  }
7163-  return RET_ERROR;
7164-}
7165-
7166-int LstmCPUKernel::LstmPreProcessWithInput(const float *weight_i, const float *input_bias, float *dst) {
7167-  for (int i = 0; i < gate_num; i++) {
7168-    weight_loop_ = weight_i + lstm_param_->input_size_ * lstm_param_->input_col_align_ * i;
7169-    bias_loop_ = input_bias + lstm_param_->input_col_align_ * i;
7170-    gate_loop_ = dst + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * i;
7171-    auto ret = ParallelLaunch(this->ms_context_, LstmInputMulWeightRun, this, input_thread_count_);
7172-    if (ret != RET_OK) {
7173-      return RET_ERROR;
7174-    }
7175-  }
7176-  return RET_OK;
7177-}
7178-
7179-void LstmCPUKernel::LstmUnidirectional(float *output, const float *weight_h, const float *state_bias,
7180-                                       float *hidden_state, float *cell_state, const float *weight_project,
7181-                                       float *intermediate_states, float *buffer[], bool is_backward) {
7182-  float *gate = buffer[input_gate_index];
7183-  float *input_gate = gate;
7184-  float *forget_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C2NUM;
7185-  float *cell_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C3NUM;
7186-  float *output_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_;
7187-  float *tmp = buffer[tmp_hidden_output_index];
7188-  int dir_mult = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
7189-  for (int t = 0; t < lstm_param_->seq_len_; t++) {
7190-    int real_t = is_backward ? lstm_param_->seq_len_ - t - C1NUM : t;
7191-    float *input_gate_t = input_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
7192-    float *forget_gate_t = forget_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
7193-    float *cell_gate_t = cell_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
7194-    float *output_gate_t = output_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
7195-    // if ONNX
7196-    if (in_tensors_.size() > mindir_input_tensors) {
7197-      // Sequence, DirMul, Batch, Hidden
7198-      float *output_ptr = output + real_t * lstm_param_->output_step_;
7199-
7200-      LstmStepUnit(output_ptr, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias,
7201-                   weight_project, hidden_state, cell_state, buffer, lstm_param_);
7202-    } else {
7203-      // Sequence, Batch, DirMul, Hidden
7204-      LstmStepUnit(tmp, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias, nullptr,
7205-                   hidden_state, cell_state, buffer, lstm_param_);
7206-      int seq_offset = real_t * lstm_param_->batch_ * dir_mult * lstm_param_->hidden_size_;
7207-      for (int b = 0; b < lstm_param_->batch_; b++) {
7208-        int batch_offset = b * dir_mult * lstm_param_->hidden_size_;
7209-        float *output_ptr = output + seq_offset + batch_offset;
7210-        memcpy(output_ptr, tmp + b * lstm_param_->hidden_size_, lstm_param_->hidden_size_ * sizeof(float));
7211-      }
7212-    }
7213-    if (intermediate_states) {
7214-      RecordStates(hidden_state, cell_state, input_gate_t, output_gate_t, forget_gate_t, cell_gate_t,
7215-                   intermediate_states, real_t);
7216-    }
7217-  }
7218-}
7219-
7220-void LstmCPUKernel::RecordStates(const float *hidden_state, float *cell_state, float *input_gate,
7221-                                 const float *output_gate, float *forget_gate, const float *cell_gate,
7222-                                 float *intermediate_states, int step) {
7223-  float *states = intermediate_states;
7224-  auto state_size = lstm_param_->batch_ * lstm_param_->hidden_size_;
7225-  if (state_size < 0) {
7226-    MS_LOG(ERROR) << "state size should be greater than or equal to zero.";
7227-    return;
7228-  }
7229-  auto stride = step * lstm_param_->output_step_;
7230-  auto seq_stride = lstm_param_->seq_len_ * lstm_param_->output_step_;
7231-  memcpy(states + stride, hidden_state, state_size * sizeof(float));
7232-  stride += seq_stride;
7233-  memcpy(states + stride, cell_state, state_size * sizeof(float));
7234-  stride += seq_stride;
7235-  memcpy(states + stride, input_gate, state_size * sizeof(float));
7236-  stride += seq_stride;
7237-  memcpy(states + stride, output_gate, state_size * sizeof(float));
7238-  stride += seq_stride;
7239-  memcpy(states + stride, forget_gate, state_size * sizeof(float));
7240-  stride += seq_stride;
7241-  memcpy(states + stride, cell_gate, state_size * sizeof(float));
7242-}
7243-
7244-void LstmCPUKernel::LstmForwardLoop(float *buffer[]) {
7245-  auto *output = reinterpret_cast<float *>(out_tensors_.at(0)->data());
7246-  auto *hidden_state = reinterpret_cast<float *>(out_tensors_.at(1)->data());
7247-  auto *cell_state = reinterpret_cast<float *>(out_tensors_.at(C2NUM)->data());
7248-  LstmUnidirectional(output, weight_h_ptr_, state_bias_, hidden_state, cell_state, weight_project_ptr_,
7249-                     intermediate_states_, buffer, false);
7250-}
7251-
7252-void LstmCPUKernel::LstmBackwardLoop(float *buffer[]) {
7253-  auto *output = reinterpret_cast<float *>(out_tensors_.at(0)->data());
7254-  auto *hidden_state = reinterpret_cast<float *>(out_tensors_.at(1)->data());
7255-  auto *cell_state = reinterpret_cast<float *>(out_tensors_.at(C2NUM)->data());
7256-  const float *backward_weight_h = weight_h_ptr_ + gate_num * lstm_param_->state_col_align_ * lstm_param_->hidden_size_;
7257-  const float *backward_state_bias = state_bias_ + gate_num * lstm_param_->state_col_align_;
7258-  float *backward_output = output + lstm_param_->batch_ * lstm_param_->hidden_size_;
7259-  if (in_tensors_.size() == mindir_input_tensors) {
7260-    backward_output = output + lstm_param_->hidden_size_;
7261-  }
7262-  float *backward_cell_state = cell_state + lstm_param_->batch_ * lstm_param_->hidden_size_;
7263-  float *backward_hidden_state = hidden_state + lstm_param_->batch_ * lstm_param_->hidden_size_;
7264-  float *intermediate_states = nullptr;
7265-  if (intermediate_states_) {
7266-    intermediate_states = intermediate_states_ + lstm_param_->batch_ * lstm_param_->hidden_size_;
7267-  }
7268-  float *backward_weight_project =
7269-    weight_project_ptr_
7270-      ? weight_project_ptr_ + lstm_param_->hidden_size_ * UP_ROUND(lstm_param_->project_size_, col_tile_)
7271-      : nullptr;
7272-  LstmUnidirectional(backward_output, backward_weight_h, backward_state_bias, backward_hidden_state,
7273-                     backward_cell_state, backward_weight_project, intermediate_states, buffer, true);
7274-}
7275-
7276-int LstmCPUKernel::ExecuteUnidirectionalOrSingleThread() {
7277-  auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[input_gate_index]);
7278-  if (ret != RET_OK) {
7279-    MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed.";
7280-    return RET_ERROR;
7281-  }
7282-  LstmForwardLoop(buffer_forward_);
7283-
7284-  // backward
7285-  if (lstm_param_->bidirectional_) {
7286-    const float *backward_weight_i =
7287-      weight_i_ptr_ + gate_num * lstm_param_->input_col_align_ * lstm_param_->input_size_;
7288-    const float *backward_input_bias = input_bias_ + gate_num * lstm_param_->input_col_align_;
7289-    ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_forward_[input_gate_index]);
7290-    if (ret != RET_OK) {
7291-      MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed.";
7292-      return RET_ERROR;
7293-    }
7294-    LstmBackwardLoop(buffer_forward_);
7295-  }
7296-  return RET_OK;
7297-}
7298-
7299-int LstmCPUKernel::ExecuteBidirectionalWithMultiThread() {
7300-  auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[input_gate_index]);
7301-  if (ret != RET_OK) {
7302-    MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed.";
7303-    return RET_ERROR;
7304-  }
7305-  const float *backward_weight_i = weight_i_ptr_ + gate_num * lstm_param_->input_col_align_ * lstm_param_->input_size_;
7306-  const float *backward_input_bias = input_bias_ + gate_num * lstm_param_->input_col_align_;
7307-  ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_backward_[input_gate_index]);
7308-  if (ret != RET_OK) {
7309-    MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed.";
7310-    return RET_ERROR;
7311-  }
7312-  ret = ParallelLaunch(this->ms_context_, LstmSequenceLoopRun, this, C2NUM);
7313-  if (ret != RET_OK) {
7314-    MS_LOG(ERROR) << "LSTM: Do sequence-loop failed.";
7315-  }
7316-  return ret;
7317-}
7318-
7319-int LstmCPUKernel::Run() {
7320-  auto input = in_tensors_.at(0);
7321-  auto output = out_tensors_.at(0);
7322-  CHECK_NULL_RETURN(input);
7323-  CHECK_NULL_RETURN(output);
7324-  auto input_ptr = reinterpret_cast<float *>(input->data());
7325-  CHECK_NULL_RETURN(input_ptr);
7326-  auto output_ptr = reinterpret_cast<float *>(output->data());
7327-  CHECK_NULL_RETURN(output_ptr);
7328-
7329-  auto hidden_state = in_tensors_.at(hidden_state_input_index_);
7330-  CHECK_NULL_RETURN(hidden_state->data());
7331-  auto cell_state = in_tensors_.at(cell_state_input_index_);
7332-  CHECK_NULL_RETURN(cell_state->data());
7333-
7334-  auto output_hidden_state = out_tensors_[kOutputHiddenStatusIndex];
7335-  CHECK_NULL_RETURN(output_hidden_state->data());
7336-  (void)memcpy(output_hidden_state->data(), hidden_state->data(), hidden_state->ElementsNum() * sizeof(float));
7337-  auto output_cell_state = out_tensors_[kOutputCellStatusIndex];
7338-  CHECK_NULL_RETURN(output_cell_state->data());
7339-  (void)memcpy(output_cell_state->data(), cell_state->data(), cell_state->ElementsNum() * sizeof(float));
7340-
7341-  auto ret = InitInputWeightBias();
7342-  if (ret != RET_OK) {
7343-    MS_LOG(ERROR) << "LstmCPUKernel InitInputWeightBias error.";
7344-    FreeRunBuffer();
7345-    return RET_ERROR;
7346-  }
7347-
7348-  ret = InitStateWeightBias();
7349-  if (ret != RET_OK) {
7350-    MS_LOG(ERROR) << "LstmCPUKernel InitStateWeightBias error.";
7351-    FreeRunBuffer();
7352-    return RET_ERROR;
7353+    kernel = new (std::nothrow)
7354+      LstmNonMindirFp32CPUKernel(parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
7355   }
7356-
7357-  ret = InitProjectWeight();
7358-  if (ret != RET_OK) {
7359-    MS_LOG(ERROR) << "LstmCPUKernel InitProjectWeight error.";
7360-    FreeRunBuffer();
7361-    return RET_ERROR;
7362-  }
7363-  bool is_bidirectional_with_multi_thread = thread_num_ != 1 && lstm_param_->bidirectional_;
7364-  ret = MallocRunBuffer(is_bidirectional_with_multi_thread);
7365-  if (ret != RET_OK) {
7366-    MS_LOG(ERROR) << "LstmCPUKernel MallocRunBuffer Error.";
7367-    FreeRunBuffer();
7368-    return RET_ERROR;
7369-  }
7370-
7371-  PackLstmInput(input_ptr, packed_input_, lstm_param_->seq_len_ * lstm_param_->batch_, lstm_param_->input_size_);
7372-  if (IsTrain() && IsTrainable()) {
7373-    intermediate_states_ = reinterpret_cast<float *>(out_tensors_[out_intermediate_states_index]->data());
7374+  if (kernel == nullptr) {
7375+    MS_LOG(ERROR) << "kernel: " << parameter->name_ << "is nullptr.";
7376+    free(parameter);
7377+    return nullptr;
7378   }
7379-  CHECK_NULL_RETURN(weight_h_ptr_);
7380-  CHECK_NULL_RETURN(weight_i_ptr_);
7381-  CHECK_NULL_RETURN(input_bias_);
7382-  CHECK_NULL_RETURN(state_bias_);
7383-  if (is_bidirectional_with_multi_thread) {
7384-    ret = ExecuteBidirectionalWithMultiThread();
7385-  } else {
7386-    ret = ExecuteUnidirectionalOrSingleThread();
7387-  }
7388-  FreeRunBuffer();
7389-  return ret;
7390+  return kernel;
7391 }
7392-
7393-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LSTM, LiteKernelCreator<LstmCPUKernel>)
7394+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LSTM, LstmFp32KernelCreator)
7395 }  // namespace mindspore::kernel
7396diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.cc
7397new file mode 100644
7398index 00000000..bd0f0e7d
7399--- /dev/null
7400+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.cc
7401@@ -0,0 +1,398 @@
7402+/**
7403+ * Copyright 2023 Huawei Technologies Co., Ltd
7404+ *
7405+ * Licensed under the Apache License, Version 2.0 (the "License");
7406+ * you may not use this file except in compliance with the License.
7407+ * You may obtain a copy of the License at
7408+ *
7409+ * http://www.apache.org/licenses/LICENSE-2.0
7410+ *
7411+ * Unless required by applicable law or agreed to in writing, software
7412+ * distributed under the License is distributed on an "AS IS" BASIS,
7413+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7414+ * See the License for the specific language governing permissions and
7415+ * limitations under the License.
7416+ */
7417+
7418+#include "src/litert/kernel/cpu/fp32/lstm_fp32_base.h"
7419+#include <vector>
7420+#include "include/errorcode.h"
7421+#include "nnacl/fp32/pack_fp32.h"
7422+#include "nnacl/fp32/matmul_fp32.h"
7423+
7424+using mindspore::lite::RET_ERROR;
7425+using mindspore::lite::RET_MEMORY_FAILED;
7426+using mindspore::lite::RET_OK;
7427+
7428+namespace mindspore::kernel {
7429+namespace {
7430+constexpr size_t kMindirInputTensorNum = 4;
7431+constexpr int kGateNum = 4;
7432+constexpr int kOutIntermediateStatesIndex = 3;
7433+constexpr int kInputGateIndex = 0;
7434+}  // namespace
7435+
7436+int LstmSequenceLoopRun(void *cdata, int task_id, float, float) {
7437+  auto kernel = reinterpret_cast<LstmFp32BaseCPUKernel *>(cdata);
7438+  CHECK_NULL_RETURN(kernel);
7439+  auto ret = kernel->DoSequenceLoop(task_id);
7440+  if (ret != RET_OK) {
7441+    MS_LOG(ERROR) << "LSTM: Do Sequence-loop failed.";
7442+  }
7443+  return ret;
7444+}
7445+
7446+int LstmFp32BaseCPUKernel::Prepare() {
7447+  MS_CHECK_TRUE_MSG(in_tensors_.size() == kMindirInputTensorNum || in_tensors_.size() >= C6NUM,
7448+                    lite::RET_INPUT_TENSOR_ERROR, "Lstm's input-num is invalid.");
7449+  for (size_t i = 0; i < in_tensors_.size(); i++) {
7450+    CHECK_NULL_RETURN(in_tensors_.at(i));
7451+  }
7452+  CHECK_LESS_RETURN(out_tensors_.size(), DIMENSION_3D);
7453+  for (size_t i = 0; i < out_tensors_.size(); i++) {
7454+    CHECK_NULL_RETURN(out_tensors_.at(i));
7455+  }
7456+  CHECK_NULL_RETURN(lstm_param_);
7457+  if (!InferShapeDone()) {
7458+    return RET_OK;
7459+  }
7460+  return ReSize();
7461+}
7462+
7463+int LstmFp32BaseCPUKernel::ReSize() {
7464+  auto input = in_tensors_.front();
7465+  std::vector<int> in_shape = input->shape();
7466+  MS_CHECK_TRUE_MSG(in_shape.size() == C3NUM, lite::RET_INPUT_TENSOR_ERROR,
7467+                    "The dims of LSTM's first input must be 3.");
7468+  lstm_param_->seq_len_ = in_shape.at(FIRST_INPUT);
7469+  lstm_param_->batch_ = in_shape.at(SECOND_INPUT);
7470+  lstm_param_->input_size_ = in_shape.at(THIRD_INPUT);
7471+
7472+  auto h_init_shape = in_tensors_.at(hidden_init_index_)->shape();
7473+  auto c_init_shape = in_tensors_.at(cell_init_index_)->shape();
7474+  lstm_param_->hidden_size_ = c_init_shape.back();
7475+  lstm_param_->output_size_ = h_init_shape.back();
7476+
7477+  lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->output_size_
7478+                                                          : lstm_param_->batch_ * lstm_param_->output_size_;
7479+  weight_segment_num_ = lstm_param_->bidirectional_ ? C2NUM * kGateNum : kGateNum;
7480+
7481+#ifdef ENABLE_AVX
7482+  row_tile_ = C6NUM;
7483+  col_tile_ = C16NUM;
7484+#elif defined(ENABLE_ARM32)
7485+  row_tile_ = C12NUM;
7486+  col_tile_ = C4NUM;
7487+#elif defined(ENABLE_SSE)
7488+  row_tile_ = C4NUM;
7489+  col_tile_ = C8NUM;
7490+#else
7491+  row_tile_ = C12NUM;
7492+  col_tile_ = C8NUM;
7493+#endif
7494+  lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, row_tile_);
7495+  lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, col_tile_);
7496+
7497+  state_row_tile_ = row_tile_;
7498+  state_col_tile_ = col_tile_;
7499+#ifdef ENABLE_AVX
7500+  if (lstm_param_->batch_ == 1) {
7501+    state_row_tile_ = 1;
7502+    state_col_tile_ = C8NUM;
7503+  }
7504+#endif
7505+
7506+  lstm_param_->state_row_align_ = lstm_param_->batch_ == 1 ? 1 : UP_ROUND(lstm_param_->batch_, state_row_tile_);
7507+#ifdef ENABLE_AVX
7508+  lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, state_col_tile_);
7509+  lstm_param_->proj_col_align_ = UP_ROUND(lstm_param_->output_size_, state_col_tile_);
7510+#else
7511+  lstm_param_->state_col_align_ =
7512+    lstm_param_->batch_ == 1 ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, state_col_tile_);
7513+  lstm_param_->proj_col_align_ =
7514+    lstm_param_->batch_ == 1 ? lstm_param_->output_size_ : UP_ROUND(lstm_param_->output_size_, state_col_tile_);
7515+#endif
7516+  return RET_OK;
7517+}
7518+
7519+int LstmFp32BaseCPUKernel::Run() {
7520+  auto input = in_tensors_.at(FIRST_INPUT);
7521+  auto output = out_tensors_.at(FIRST_INPUT);
7522+  auto input_ptr = reinterpret_cast<float *>(input->data());
7523+  CHECK_NULL_RETURN(input_ptr);
7524+  auto output_ptr = reinterpret_cast<float *>(output->data());
7525+  CHECK_NULL_RETURN(output_ptr);
7526+
7527+  auto hidden_state = in_tensors_.at(hidden_init_index_);
7528+  CHECK_NULL_RETURN(hidden_state->data());
7529+  auto cell_state = in_tensors_.at(cell_init_index_);
7530+  CHECK_NULL_RETURN(cell_state->data());
7531+
7532+  auto output_hidden_state = out_tensors_[SECOND_INPUT];
7533+  CHECK_NULL_RETURN(output_hidden_state->data());
7534+  (void)memcpy(output_hidden_state->data(), hidden_state->data(), hidden_state->ElementsNum() * sizeof(float));
7535+  auto output_cell_state = out_tensors_[THIRD_INPUT];
7536+  CHECK_NULL_RETURN(output_cell_state->data());
7537+  (void)memcpy(output_cell_state->data(), cell_state->data(), cell_state->ElementsNum() * sizeof(float));
7538+
7539+  auto ret = InitInputWeightBias();
7540+  if (ret != RET_OK) {
7541+    MS_LOG(ERROR) << "LstmCPUKernel InitInputWeightBias error.";
7542+    FreeRunBuffer();
7543+    return RET_ERROR;
7544+  }
7545+
7546+  ret = InitStateWeightBias();
7547+  if (ret != RET_OK) {
7548+    MS_LOG(ERROR) << "LstmCPUKernel InitStateWeightBias error.";
7549+    FreeRunBuffer();
7550+    return RET_ERROR;
7551+  }
7552+
7553+  ret = InitProjectWeight();
7554+  if (ret != RET_OK) {
7555+    MS_LOG(ERROR) << "LstmCPUKernel InitProjectWeight error.";
7556+    FreeRunBuffer();
7557+    return RET_ERROR;
7558+  }
7559+  bool is_bidirectional_with_multi_thread = thread_num_ != 1 && lstm_param_->bidirectional_;
7560+  ret = MallocRunBuffer(is_bidirectional_with_multi_thread);
7561+  if (ret != RET_OK) {
7562+    MS_LOG(ERROR) << "LstmCPUKernel MallocRunBuffer Error.";
7563+    FreeRunBuffer();
7564+    return RET_ERROR;
7565+  }
7566+
7567+  PackLstmInput(input_ptr, packed_input_, lstm_param_->seq_len_ * lstm_param_->batch_, lstm_param_->input_size_);
7568+  if (IsTrain() && IsTrainable()) {
7569+    intermediate_states_ = reinterpret_cast<float *>(out_tensors_[kOutIntermediateStatesIndex]->data());
7570+  }
7571+  CHECK_NULL_RETURN(weight_h_ptr_);
7572+  CHECK_NULL_RETURN(weight_i_ptr_);
7573+  CHECK_NULL_RETURN(input_bias_);
7574+  CHECK_NULL_RETURN(state_bias_);
7575+  if (is_bidirectional_with_multi_thread) {
7576+    ret = ExecuteBidirectionalWithMultiThread();
7577+  } else {
7578+    ret = ExecuteUnidirectionalOrSingleThread();
7579+  }
7580+  FreeRunBuffer();
7581+  return ret;
7582+}
7583+
7584+void LstmFp32BaseCPUKernel::FreeRunBuffer() {
7585+  for (auto data : running_buffer_) {
7586+    ms_context_->allocator->Free(data);
7587+  }
7588+  running_buffer_.clear();
7589+}
7590+
7591+int LstmFp32BaseCPUKernel::MallocRunBuffer(bool is_double) {
7592+  bool need_zone = lstm_param_->zoneout_cell_ < -FLT_EPSILON || lstm_param_->zoneout_cell_ > FLT_EPSILON;
7593+  size_t whole_size = 0;
7594+  std::vector<size_t> segments;
7595+  int scale = is_double ? C2NUM : 1;
7596+  size_t segment = kGateNum * lstm_param_->seq_len_ * lstm_param_->batch_ *
7597+                   lstm_param_->hidden_size_;  // 0: input * weight for result matrix
7598+  segments.push_back(segment);
7599+  whole_size += segment * scale;
7600+
7601+  segment = lstm_param_->batch_ == 1
7602+              ? 0
7603+              : lstm_param_->state_row_align_ * lstm_param_->output_size_;  // 1: state * weight for left matirx
7604+  segments.push_back(segment);
7605+  whole_size += segment * scale;
7606+
7607+  segment = kGateNum * lstm_param_->batch_ * lstm_param_->hidden_size_;  // 2: state gate buffer
7608+  segments.push_back(segment);
7609+  whole_size += segment * scale;
7610+
7611+  segment = need_zone ? lstm_param_->batch_ * lstm_param_->hidden_size_ : 0;  // 3: state_buffer for cell
7612+  segments.push_back(segment);
7613+  whole_size += segment * scale;
7614+
7615+  segment = need_zone ? lstm_param_->batch_ * lstm_param_->output_size_ : 0;  // 4: state_buffer for hidden
7616+  segments.push_back(segment);
7617+  whole_size += segment * scale;
7618+
7619+  segment = 0;
7620+#ifdef ENABLE_AVX
7621+  bool output_need_packed = lstm_param_->hidden_size_ % state_col_tile_;
7622+  if (lstm_param_->batch_ == 1 && output_need_packed) {  // vec matmul need to malloc dst
7623+    int out_channel = lstm_param_->hidden_size_;
7624+    int oc_block_num = UP_DIV(out_channel, state_col_tile_);
7625+    MS_ASSERT(ms_context_->allocator != nullptr);
7626+    segment = lstm_param_->batch_ * oc_block_num * state_col_tile_;  // 5: tmp output data
7627+  }
7628+#endif
7629+  segments.push_back(segment);
7630+  whole_size += segment * scale;
7631+
7632+  if (in_tensors_.size() == C7NUM || lstm_param_->project_size_ != 0) {
7633+    segment = lstm_param_->batch_ == 1 ? 0 : lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * scale;
7634+    segments.push_back(segment);  // 6: project-layer input
7635+    whole_size += segment;
7636+    segment = 0;
7637+#ifdef ENABLE_AVX
7638+    segment =
7639+      output_need_packed ? lstm_param_->batch_ * UP_ROUND(lstm_param_->output_size_, state_col_tile_) * scale : 0;
7640+#endif
7641+    segments.push_back(segment);  // 7: project-layer output
7642+    whole_size += segment;
7643+  } else {
7644+    (void)segments.insert(segments.end(), C2NUM, 0);
7645+  }
7646+
7647+  segment = 0;
7648+  if (in_tensors_.size() == kMindirInputTensorNum) {
7649+    segment = lstm_param_->batch_ * lstm_param_->output_size_;
7650+  }
7651+  segments.push_back(segment);
7652+  whole_size += segment * scale;
7653+
7654+  segment =
7655+    lstm_param_->input_row_align_ * lstm_param_->input_size_;  // input * weight for left matrix, which only once
7656+  whole_size += segment;
7657+
7658+  auto whole_memory = reinterpret_cast<float *>(ms_context_->allocator->Malloc(whole_size * sizeof(float)));
7659+  MS_CHECK_TRUE_MSG(whole_memory != nullptr, RET_ERROR, "LSTM: malloc failed.");
7660+  running_buffer_.push_back(whole_memory);
7661+  MS_ASSERT(segments.size() == C9NUM);
7662+  auto Allocate = [&whole_memory, &segments](float **buffer) mutable {
7663+    for (int i = 0; i < C9NUM; ++i) {
7664+      buffer[i] = nullptr;
7665+      if (segments[i] == 0) {
7666+        continue;
7667+      }
7668+      buffer[i] = whole_memory;
7669+      whole_memory += segments[i];
7670+    }
7671+  };
7672+  Allocate(buffer_forward_);
7673+  if (is_double) {
7674+    Allocate(buffer_backward_);
7675+  }
7676+  packed_input_ = whole_memory;
7677+  return RET_OK;
7678+}
7679+
7680+int LstmFp32BaseCPUKernel::ExecuteBidirectionalWithMultiThread() {
7681+  auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[kInputGateIndex]);
7682+  if (ret != RET_OK) {
7683+    MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed.";
7684+    return RET_ERROR;
7685+  }
7686+  const float *backward_weight_i = weight_i_ptr_ + kGateNum * lstm_param_->input_col_align_ * lstm_param_->input_size_;
7687+  const float *backward_input_bias = input_bias_ + kGateNum * lstm_param_->input_col_align_;
7688+  ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_backward_[kInputGateIndex]);
7689+  if (ret != RET_OK) {
7690+    MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed.";
7691+    return RET_ERROR;
7692+  }
7693+  ret = ParallelLaunch(this->ms_context_, LstmSequenceLoopRun, this, C2NUM);
7694+  if (ret != RET_OK) {
7695+    MS_LOG(ERROR) << "LSTM: Do sequence-loop failed.";
7696+  }
7697+  return ret;
7698+}
7699+
7700+int LstmFp32BaseCPUKernel::ExecuteUnidirectionalOrSingleThread() {
7701+  auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[kInputGateIndex]);
7702+  if (ret != RET_OK) {
7703+    MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed.";
7704+    return RET_ERROR;
7705+  }
7706+  LstmForwardLoop(buffer_forward_);
7707+
7708+  // backward
7709+  if (lstm_param_->bidirectional_) {
7710+    const float *backward_weight_i =
7711+      weight_i_ptr_ + kGateNum * lstm_param_->input_col_align_ * lstm_param_->input_size_;
7712+    const float *backward_input_bias = input_bias_ + kGateNum * lstm_param_->input_col_align_;
7713+    ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_forward_[kInputGateIndex]);
7714+    if (ret != RET_OK) {
7715+      MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed.";
7716+      return RET_ERROR;
7717+    }
7718+    LstmBackwardLoop(buffer_forward_);
7719+  }
7720+  return RET_OK;
7721+}
7722+
7723+int LstmFp32BaseCPUKernel::LstmPreProcessWithInput(const float *weight_i, const float *input_bias, float *dst) {
7724+  const float *weight{nullptr};
7725+  const float *bias{nullptr};
7726+  float *gate{nullptr};
7727+  int thread_num = MSMIN(op_parameter_->thread_num_, UP_DIV(lstm_param_->input_col_align_, col_tile_));
7728+  MS_CHECK_FALSE(thread_num == 0, RET_ERROR);
7729+  int stride = UP_DIV(UP_DIV(lstm_param_->input_col_align_, col_tile_), thread_num);
7730+  auto MatMulCoreFunc = [this, &weight, &bias, &gate, &stride](void *, int task_id, float, float) {
7731+    int current_start_oc = task_id * stride * col_tile_;
7732+    int current_rest_oc = 0;
7733+    current_rest_oc = lstm_param_->hidden_size_ - current_start_oc;
7734+    int cur_oc = MSMIN(stride * col_tile_, current_rest_oc);
7735+    if (cur_oc <= 0) {
7736+      return RET_OK;
7737+    }
7738+
7739+    auto b = weight + current_start_oc * lstm_param_->input_size_;
7740+    auto c = gate + current_start_oc;
7741+    auto bias_ = (bias == nullptr) ? nullptr : bias + current_start_oc;
7742+    MatMulOpt(packed_input_, b, c, bias_, ActType_No, lstm_param_->input_size_,
7743+              lstm_param_->seq_len_ * lstm_param_->batch_, cur_oc, lstm_param_->hidden_size_, OutType_Nhwc);
7744+    return RET_OK;
7745+  };
7746+  for (int i = 0; i < kGateNum; i++) {
7747+    weight = weight_i + lstm_param_->input_size_ * lstm_param_->input_col_align_ * i;
7748+    bias = input_bias + lstm_param_->input_col_align_ * i;
7749+    gate = dst + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * i;
7750+    auto ret = ParallelLaunch(this->ms_context_, MatMulCoreFunc, nullptr, thread_num);
7751+    if (ret != RET_OK) {
7752+      return RET_ERROR;
7753+    }
7754+  }
7755+  return RET_OK;
7756+}
7757+
7758+int LstmFp32BaseCPUKernel::DoSequenceLoop(int task_id) {
7759+  if (task_id == 0) {
7760+    LstmForwardLoop(buffer_forward_);
7761+    return RET_OK;
7762+  }
7763+  if (task_id == 1) {
7764+    LstmBackwardLoop(buffer_backward_);
7765+    return RET_OK;
7766+  }
7767+  return RET_ERROR;
7768+}
7769+
7770+void LstmFp32BaseCPUKernel::LstmForwardLoop(float *buffer[]) {
7771+  auto *output = reinterpret_cast<float *>(out_tensors_.at(FIRST_INPUT)->data());
7772+  auto *hidden_state = reinterpret_cast<float *>(out_tensors_.at(SECOND_INPUT)->data());
7773+  auto *cell_state = reinterpret_cast<float *>(out_tensors_.at(THIRD_INPUT)->data());
7774+  LstmUnidirectional(output, weight_h_ptr_, state_bias_, hidden_state, cell_state, weight_project_ptr_,
7775+                     intermediate_states_, buffer, false);
7776+}
7777+
7778+void LstmFp32BaseCPUKernel::LstmBackwardLoop(float *buffer[]) {
7779+  auto *output = reinterpret_cast<float *>(out_tensors_.at(0)->data());
7780+  auto *hidden_state = reinterpret_cast<float *>(out_tensors_.at(1)->data());
7781+  auto *cell_state = reinterpret_cast<float *>(out_tensors_.at(C2NUM)->data());
7782+  const float *backward_weight_h = weight_h_ptr_ + kGateNum * lstm_param_->state_col_align_ * lstm_param_->output_size_;
7783+  const float *backward_state_bias = state_bias_ + kGateNum * lstm_param_->state_col_align_;
7784+  float *backward_output = output + lstm_param_->batch_ * lstm_param_->output_size_;
7785+  if (in_tensors_.size() == kMindirInputTensorNum) {
7786+    backward_output = output + lstm_param_->output_size_;
7787+  }
7788+  float *backward_cell_state = cell_state + lstm_param_->batch_ * lstm_param_->hidden_size_;
7789+  float *backward_hidden_state = hidden_state + lstm_param_->batch_ * lstm_param_->output_size_;
7790+  float *intermediate_states = nullptr;
7791+  if (intermediate_states_) {
7792+    intermediate_states = intermediate_states_ + lstm_param_->batch_ * lstm_param_->output_size_;
7793+  }
7794+  float *backward_weight_project =
7795+    weight_project_ptr_ ? weight_project_ptr_ + lstm_param_->hidden_size_ * lstm_param_->proj_col_align_ : nullptr;
7796+  LstmUnidirectional(backward_output, backward_weight_h, backward_state_bias, backward_hidden_state,
7797+                     backward_cell_state, backward_weight_project, intermediate_states, buffer, true);
7798+}
7799+}  // namespace mindspore::kernel
7800diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.h b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.h
7801new file mode 100644
7802index 00000000..c3c10cea
7803--- /dev/null
7804+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.h
7805@@ -0,0 +1,78 @@
7806+/**
7807+ * Copyright 2023 Huawei Technologies Co., Ltd
7808+ *
7809+ * Licensed under the Apache License, Version 2.0 (the "License");
7810+ * you may not use this file except in compliance with the License.
7811+ * You may obtain a copy of the License at
7812+ *
7813+ * http://www.apache.org/licenses/LICENSE-2.0
7814+ *
7815+ * Unless required by applicable law or agreed to in writing, software
7816+ * distributed under the License is distributed on an "AS IS" BASIS,
7817+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7818+ * See the License for the specific language governing permissions and
7819+ * limitations under the License.
7820+ */
7821+
7822+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_FP32_BASE_H_
7823+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_FP32_BASE_H_
7824+
7825+#include <vector>
7826+#include "src/litert/lite_kernel.h"
7827+#include "nnacl/fp32/lstm_fp32.h"
7828+
7829+namespace mindspore::kernel {
7830+class LstmFp32BaseCPUKernel : public LiteKernel {
7831+ public:
7832+  LstmFp32BaseCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
7833+                        const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
7834+      : LiteKernel(parameter, inputs, outputs, ctx) {
7835+    lstm_param_ = reinterpret_cast<LstmParameter *>(op_parameter_);
7836+  }
7837+
7838+  ~LstmFp32BaseCPUKernel() override = default;
7839+
7840+  int Prepare() override;
7841+  int ReSize() override;
7842+  int Run() override;
7843+  int DoSequenceLoop(int task_id);
7844+
7845+ protected:
7846+  virtual int InitInputWeightBias() = 0;
7847+  virtual int InitStateWeightBias() = 0;
7848+  virtual int InitProjectWeight() = 0;
7849+  virtual void LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, float *hidden_state,
7850+                                  float *cell_state, const float *weight_project, float *intermediate_states,
7851+                                  float *buffer[], bool is_backward) = 0;
7852+
7853+  int hidden_init_index_{0};
7854+  int cell_init_index_{0};
7855+  int row_tile_{0};
7856+  int col_tile_{0};
7857+  int state_row_tile_{0};
7858+  int state_col_tile_{0};
7859+  int weight_segment_num_{0};
7860+  float *weight_i_ptr_{nullptr};
7861+  float *weight_h_ptr_{nullptr};
7862+  float *weight_project_ptr_{nullptr};
7863+  float *input_bias_{nullptr};
7864+  float *state_bias_{nullptr};
7865+  LstmParameter *lstm_param_{nullptr};
7866+  std::vector<void *> running_buffer_;
7867+
7868+ private:
7869+  void FreeRunBuffer();
7870+  int MallocRunBuffer(bool is_double);
7871+  int ExecuteBidirectionalWithMultiThread();
7872+  int ExecuteUnidirectionalOrSingleThread();
7873+  int LstmPreProcessWithInput(const float *weight_i, const float *input_bias, float *dst);
7874+  void LstmForwardLoop(float *buffer[]);
7875+  void LstmBackwardLoop(float *buffer[]);
7876+  float *packed_input_{nullptr};
7877+  float *intermediate_states_{nullptr};
7878+  float *buffer_forward_[C9NUM] = {nullptr};
7879+  float *buffer_backward_[C9NUM] = {nullptr};
7880+};
7881+}  // namespace mindspore::kernel
7882+
7883+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_FP32_BASE_H_
7884diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.cc
7885new file mode 100644
7886index 00000000..476d5940
7887--- /dev/null
7888+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.cc
7889@@ -0,0 +1,266 @@
7890+/**
7891+ * Copyright 2023 Huawei Technologies Co., Ltd
7892+ *
7893+ * Licensed under the Apache License, Version 2.0 (the "License");
7894+ * you may not use this file except in compliance with the License.
7895+ * You may obtain a copy of the License at
7896+ *
7897+ * http://www.apache.org/licenses/LICENSE-2.0
7898+ *
7899+ * Unless required by applicable law or agreed to in writing, software
7900+ * distributed under the License is distributed on an "AS IS" BASIS,
7901+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7902+ * See the License for the specific language governing permissions and
7903+ * limitations under the License.
7904+ */
7905+
7906+#include "src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h"
7907+#include "nnacl/fp32/pack_fp32.h"
7908+
7909+namespace mindspore::kernel {
7910+namespace {
7911+constexpr int kInputGateIndex = 0;
7912+constexpr int kTempHiddenOutputIndex = 8;
7913+constexpr int kGateNum = 4;
7914+constexpr int kWeightsIndex = 3;
7915+const int kWeightsOrderMap[8] = {0, 2, 3, 1, 4, 6, 7, 5};  // IFGO order to IOFG order
7916+}  // namespace
7917+
7918+int LstmMindirFp32CPUKernel::ReSize() {
7919+  auto ret = LstmFp32BaseCPUKernel::ReSize();
7920+  if (ret != lite::RET_OK) {
7921+    MS_LOG(ERROR) << "LstmMindirFp32CPUKernel resize failed.";
7922+    return ret;
7923+  }
7924+  // determine FB origin
7925+  gpu_orig_state_ = false;
7926+  auto weight_t = in_tensors_.at(kWeightsIndex);
7927+  MS_CHECK_INT_MUL_NOT_OVERFLOW(lstm_param_->hidden_size_, lstm_param_->input_size_, lite::RET_ERROR);
7928+  int hi_unit_size = lstm_param_->hidden_size_ * lstm_param_->input_size_;
7929+  MS_CHECK_INT_MUL_NOT_OVERFLOW(weight_segment_num_, hi_unit_size, lite::RET_ERROR);
7930+  int hi_whole_size = weight_segment_num_ * hi_unit_size;
7931+  MS_CHECK_INT_MUL_NOT_OVERFLOW(lstm_param_->hidden_size_, lstm_param_->output_size_, lite::RET_ERROR);
7932+  int hh_unit_size = lstm_param_->hidden_size_ * lstm_param_->output_size_;
7933+  MS_CHECK_INT_MUL_NOT_OVERFLOW(weight_segment_num_, hh_unit_size, lite::RET_ERROR);
7934+  int hh_whole_size = weight_segment_num_ * hh_unit_size;
7935+  int scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
7936+  MS_CHECK_INT_MUL_NOT_OVERFLOW(lstm_param_->hidden_size_, lstm_param_->project_size_, lite::RET_ERROR);
7937+  int hp_unit_size = lstm_param_->hidden_size_ * lstm_param_->project_size_;
7938+  MS_CHECK_INT_MUL_NOT_OVERFLOW(scale, hp_unit_size, lite::RET_ERROR);
7939+  int hp_whole_size = scale * hp_unit_size;
7940+  MS_CHECK_INT_MUL_NOT_OVERFLOW(weight_segment_num_ * C2NUM, lstm_param_->hidden_size_, lite::RET_ERROR);
7941+  int bias_whole_size = weight_segment_num_ * C2NUM * lstm_param_->hidden_size_;
7942+  auto whole_size = weight_t->ElementsNum();
7943+  bool has_bias = (hi_whole_size + hh_whole_size + hp_whole_size < whole_size) ? true : false;
7944+  // if bias exist we can determine the gpu_orig_state_
7945+  if (has_bias) {
7946+    gpu_orig_state_ = (hi_whole_size + hh_whole_size + hp_whole_size + bias_whole_size == whole_size) ? true : false;
7947+  } else {
7948+    bias_whole_size = 0;
7949+  }
7950+  if (gpu_orig_state_) {
7951+    return lite::RET_OK;
7952+  }
7953+  bias_whole_size /= C2NUM;
7954+  if (hi_whole_size + hh_whole_size + hp_whole_size + bias_whole_size != whole_size) {
7955+    MS_LOG(ERROR) << "LstmMindir is invalid when original model exports from CPU.";
7956+    return lite::RET_INPUT_TENSOR_ERROR;
7957+  }
7958+  return lite::RET_OK;
7959+}
7960+
7961+int LstmMindirFp32CPUKernel::InitInputWeightBias() {
7962+  // malloc and init input * weight right matrix buffer
7963+  // input -- row: seq_len * batch; col: input_size
7964+  // weight -- row: hidden_size; col: input_size, need transpose
7965+  // result -- row: seq_len * batch; col: hidden_size
7966+  weight_i_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
7967+    weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float)));
7968+  MS_CHECK_TRUE_MSG(weight_i_ptr_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc weight_i_ptr_ failed.");
7969+  running_buffer_.push_back(weight_i_ptr_);
7970+  auto weight_data = reinterpret_cast<float *>(in_tensors_.at(kWeightsIndex)->data());
7971+  CHECK_NULL_RETURN(weight_data);
7972+
7973+  int hi_unit_size = lstm_param_->input_size_ * lstm_param_->hidden_size_;
7974+  int hh_unit_size = lstm_param_->hidden_size_ * lstm_param_->output_size_;
7975+  int stride = (gpu_orig_state_) ? kGateNum * (hi_unit_size + hh_unit_size) : kGateNum * hi_unit_size;
7976+  PackLstmWeightWithStride(weight_i_ptr_, weight_data, weight_segment_num_, lstm_param_->input_size_,
7977+                           lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_,
7978+                           stride, kWeightsOrderMap);
7979+  // input bias
7980+  auto bias_size = weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float);
7981+  input_bias_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(bias_size));
7982+  MS_CHECK_TRUE_MSG(input_bias_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc input_bias_ failed.");
7983+  memset(input_bias_, 0, bias_size);
7984+  running_buffer_.push_back(input_bias_);
7985+  if (!lstm_param_->has_bias_) {
7986+    return RET_OK;
7987+  }
7988+  int scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
7989+  int offset = weight_segment_num_ * (hi_unit_size + hh_unit_size) +
7990+               scale * lstm_param_->project_size_ * lstm_param_->hidden_size_;
7991+  float *bias_data = weight_data + offset;
7992+  int b_stride =
7993+    (gpu_orig_state_) ? kGateNum * (scale * lstm_param_->hidden_size_) : kGateNum * (lstm_param_->hidden_size_);
7994+  PackLstmBiasWithStride(input_bias_, bias_data, weight_segment_num_, lstm_param_->hidden_size_,
7995+                         lstm_param_->input_col_align_, lstm_param_->bidirectional_, b_stride, kWeightsOrderMap);
7996+  return RET_OK;
7997+}
7998+
7999+int LstmMindirFp32CPUKernel::InitStateWeightBias() {
8000+  // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times.
8001+  // state -- row: batch; col: hidden_size
8002+  // weight -- row: hidden_size; col: hidden_size, need transpose
8003+  // result -- row: batch; col: hidden_size
8004+  auto weight_data = (reinterpret_cast<float *>(in_tensors_.at(kWeightsIndex)->data()));
8005+  CHECK_NULL_RETURN(weight_data);
8006+
8007+  int hi_unit_size = lstm_param_->input_size_ * lstm_param_->hidden_size_;
8008+  int hh_unit_size = lstm_param_->hidden_size_ * lstm_param_->output_size_;
8009+  int stride = (gpu_orig_state_) ? kGateNum * (hi_unit_size + hh_unit_size) : kGateNum * hh_unit_size;
8010+
8011+  auto weight_h_data = weight_data + (gpu_orig_state_ ? kGateNum * hi_unit_size : weight_segment_num_ * hi_unit_size);
8012+
8013+  auto weight_unit_pack_size = sizeof(float) * lstm_param_->state_col_align_ * lstm_param_->output_size_;
8014+  auto weight_pack_size = weight_segment_num_ * weight_unit_pack_size;
8015+  weight_h_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(weight_pack_size));
8016+  MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc weight_h_ptr_ failed.");
8017+  running_buffer_.push_back(weight_h_ptr_);
8018+  if (lstm_param_->batch_ != 1) {
8019+    PackLstmWeightWithStride(weight_h_ptr_, weight_h_data, weight_segment_num_, lstm_param_->output_size_,
8020+                             lstm_param_->hidden_size_, lstm_param_->state_col_align_, lstm_param_->bidirectional_,
8021+                             stride, kWeightsOrderMap);
8022+  } else {
8023+    for (int i = 0; i < weight_segment_num_; i++) {
8024+      const float *src_batch = weight_h_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8025+      float *dst_batch =
8026+        weight_h_ptr_ + kWeightsOrderMap[i] * lstm_param_->state_col_align_ * lstm_param_->output_size_;
8027+#ifdef ENABLE_AVX
8028+      RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->hidden_size_, lstm_param_->output_size_);
8029+#else
8030+      (void)memcpy(dst_batch, src_batch, weight_unit_pack_size);
8031+#endif
8032+    }
8033+  }
8034+
8035+  // state bias
8036+  auto bias_pack_size = weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float);
8037+  state_bias_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(bias_pack_size));
8038+  MS_CHECK_TRUE_MSG(state_bias_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc state_bias_ failed.");
8039+  memset(state_bias_, 0, bias_pack_size);
8040+  running_buffer_.push_back(state_bias_);
8041+  if (!lstm_param_->has_bias_ || !gpu_orig_state_) {
8042+    return RET_OK;
8043+  }
8044+
8045+  int hi_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->input_size_;
8046+  int hh_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8047+  int proj_size =
8048+    (lstm_param_->bidirectional_ ? C2NUM : C1NUM) * lstm_param_->project_size_ * lstm_param_->hidden_size_;
8049+  // mindir from device "GPU", secend bias is also present order IFOG
8050+  int bias_offset = hi_whole_size + hh_whole_size + proj_size + lstm_param_->hidden_size_ * kGateNum;
8051+  float *state_bias = weight_data + bias_offset;
8052+  int b_stride = kGateNum * lstm_param_->hidden_size_ * C2NUM;
8053+  PackLstmBiasWithStride(state_bias_, state_bias, weight_segment_num_, lstm_param_->hidden_size_,
8054+                         lstm_param_->state_col_align_, lstm_param_->bidirectional_, b_stride, kWeightsOrderMap);
8055+  return RET_OK;
8056+}
8057+
8058+int LstmMindirFp32CPUKernel::InitProjectWeight() {
8059+  if (lstm_param_->project_size_ == 0) {
8060+    return RET_OK;
8061+  }
8062+  auto weight_data = (reinterpret_cast<float *>(in_tensors_.at(kWeightsIndex)->data()));
8063+  CHECK_NULL_RETURN(weight_data);
8064+  int hi_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->input_size_;
8065+  int hh_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8066+  auto weight_proj_data = weight_data + hi_whole_size + hh_whole_size;
8067+  int batch = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
8068+  auto pack_size = batch * lstm_param_->hidden_size_ * lstm_param_->proj_col_align_ * sizeof(float);
8069+  if (lstm_param_->batch_ != 1) {
8070+    weight_project_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_size));
8071+    MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR,
8072+                      "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed.");
8073+    running_buffer_.push_back(weight_project_ptr_);
8074+    PackLstmWeightWithStride(weight_project_ptr_, weight_proj_data, batch, lstm_param_->hidden_size_,
8075+                             lstm_param_->output_size_, lstm_param_->proj_col_align_, lstm_param_->bidirectional_,
8076+                             lstm_param_->hidden_size_ * lstm_param_->output_size_, nullptr);
8077+  } else {
8078+#ifdef ENABLE_AVX
8079+    weight_project_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_size));
8080+    MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR,
8081+                      "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed.");
8082+    running_buffer_.push_back(weight_project_ptr_);
8083+    for (int i = 0; i < batch; ++i) {
8084+      const float *src_batch = weight_proj_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8085+      float *dst_batch = weight_project_ptr_ + i * lstm_param_->hidden_size_ * lstm_param_->proj_col_align_;
8086+      RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->output_size_, lstm_param_->hidden_size_);
8087+    }
8088+#else
8089+    weight_project_ptr_ = weight_proj_data;
8090+#endif
8091+  }
8092+  return RET_OK;
8093+}
8094+
8095+void LstmMindirFp32CPUKernel::LstmUnidirectional(float *output, const float *weight_h, const float *state_bias,
8096+                                                 float *hidden_state, float *cell_state, const float *weight_project,
8097+                                                 float *intermediate_states, float **buffer, bool is_backward) {
8098+  float *gate = buffer[kInputGateIndex];
8099+  float *input_gate = gate;
8100+  float *forget_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C2NUM;
8101+  float *cell_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C3NUM;
8102+  float *output_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_;
8103+  float *tmp = buffer[kTempHiddenOutputIndex];
8104+  int dir_mult = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
8105+  for (int t = 0; t < lstm_param_->seq_len_; t++) {
8106+    int real_t = is_backward ? lstm_param_->seq_len_ - t - C1NUM : t;
8107+    float *input_gate_t = input_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8108+    float *forget_gate_t = forget_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8109+    float *cell_gate_t = cell_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8110+    float *output_gate_t = output_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8111+    // Sequence, Batch, DirMul, Hidden
8112+    LstmStepUnit(tmp, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias, weight_project,
8113+                 hidden_state, cell_state, buffer, lstm_param_);
8114+    int seq_offset = real_t * lstm_param_->batch_ * dir_mult * lstm_param_->output_size_;
8115+    for (int b = 0; b < lstm_param_->batch_; b++) {
8116+      int batch_offset = b * dir_mult * lstm_param_->output_size_;
8117+      float *output_ptr = output + seq_offset + batch_offset;
8118+      memcpy(output_ptr, tmp + b * lstm_param_->output_size_, lstm_param_->output_size_ * sizeof(float));
8119+    }
8120+    if (intermediate_states) {
8121+      RecordStates(hidden_state, cell_state, input_gate_t, output_gate_t, forget_gate_t, cell_gate_t,
8122+                   intermediate_states, real_t);
8123+    }
8124+  }
8125+}
8126+
8127+void LstmMindirFp32CPUKernel::RecordStates(const float *hidden_state, float *cell_state, float *input_gate,
8128+                                           const float *output_gate, float *forget_gate, const float *cell_gate,
8129+                                           float *intermediate_states, int step) {
8130+  float *states = intermediate_states;
8131+  auto hidden_size = lstm_param_->batch_ * lstm_param_->output_size_;
8132+  auto state_size = lstm_param_->batch_ * lstm_param_->hidden_size_;
8133+  if (state_size < 0) {
8134+    MS_LOG(ERROR) << "state size should be greater than or equal to zero.";
8135+    return;
8136+  }
8137+  auto hidden_stride = step * lstm_param_->output_step_;
8138+  auto hidden_seq_stride = lstm_param_->seq_len_ * lstm_param_->output_step_;
8139+  auto other_output_step = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->hidden_size_
8140+                                                       : lstm_param_->batch_ * lstm_param_->hidden_size_;
8141+  auto stride = step * other_output_step;
8142+  auto seq_stride = lstm_param_->seq_len_ * other_output_step;
8143+  memcpy(states + hidden_stride, hidden_state, hidden_size * sizeof(float));
8144+  stride += hidden_seq_stride;
8145+  memcpy(states + stride, cell_state, state_size * sizeof(float));
8146+  stride += seq_stride;
8147+  memcpy(states + stride, input_gate, state_size * sizeof(float));
8148+  stride += seq_stride;
8149+  memcpy(states + stride, output_gate, state_size * sizeof(float));
8150+  stride += seq_stride;
8151+  memcpy(states + stride, forget_gate, state_size * sizeof(float));
8152+  stride += seq_stride;
8153+  memcpy(states + stride, cell_gate, state_size * sizeof(float));
8154+}
8155+}  // namespace mindspore::kernel
8156diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h
8157new file mode 100644
8158index 00000000..84cdd38e
8159--- /dev/null
8160+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h
8161@@ -0,0 +1,63 @@
8162+/**
8163+ * Copyright 2023 Huawei Technologies Co., Ltd
8164+ *
8165+ * Licensed under the Apache License, Version 2.0 (the "License");
8166+ * you may not use this file except in compliance with the License.
8167+ * You may obtain a copy of the License at
8168+ *
8169+ * http://www.apache.org/licenses/LICENSE-2.0
8170+ *
8171+ * Unless required by applicable law or agreed to in writing, software
8172+ * distributed under the License is distributed on an "AS IS" BASIS,
8173+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8174+ * See the License for the specific language governing permissions and
8175+ * limitations under the License.
8176+ */
8177+
8178+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_MINDIR_FP32_H_
8179+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_MINDIR_FP32_H_
8180+
8181+#include <vector>
8182+#include "src/litert/kernel/cpu/fp32/lstm_fp32_base.h"
8183+
8184+namespace mindspore::kernel {
8185+/*
8186+ * 1. LSTM without project, output_size = hidden_size
8187+ *    h_init: second input, shape is [bidirectional, batch_size, hidden_size]
8188+ *    c_init: third input, shape is [bidirectional, batch_size, hidden_size]
8189+ *    weight_bias: forth input, weight_ih + weight_hh + bias, the gate order is IFGO
8190+ *
8191+ * 2. LSTM with project, output_size = project_size
8192+ *    h_init: second input, shape is [bidirectional, batch_size, project_size]
8193+ *    c_init: third input, shape is [bidirectional, batch_size, hidden_size]
8194+ *    weight_bias: forth input, weight_ih + weight_hh + proj + bias, the gate order is IFGO
8195+ */
8196+class LstmMindirFp32CPUKernel : public LstmFp32BaseCPUKernel {
8197+ public:
8198+  LstmMindirFp32CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
8199+                          const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
8200+      : LstmFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
8201+    hidden_init_index_ = SECOND_INPUT;
8202+    cell_init_index_ = THIRD_INPUT;
8203+  }
8204+
8205+  ~LstmMindirFp32CPUKernel() override = default;
8206+
8207+  int ReSize() override;
8208+
8209+ protected:
8210+  int InitInputWeightBias() override;
8211+  int InitStateWeightBias() override;
8212+  int InitProjectWeight() override;
8213+  void LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, float *hidden_state,
8214+                          float *cell_state, const float *weight_project, float *intermediate_states, float *buffer[],
8215+                          bool is_backward) override;
8216+
8217+ private:
8218+  void RecordStates(const float *hidden_state, float *cell_state, float *input_gate, const float *output_gate,
8219+                    float *forget_gate, const float *cell_gate, float *intermediate_states, int step);
8220+  bool gpu_orig_state_{false};
8221+};
8222+}  // namespace mindspore::kernel
8223+
8224+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_MINDIR_FP32_H_
8225diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.cc
8226new file mode 100644
8227index 00000000..62f9f2b7
8228--- /dev/null
8229+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.cc
8230@@ -0,0 +1,173 @@
8231+/**
8232+ * Copyright 2023 Huawei Technologies Co., Ltd
8233+ *
8234+ * Licensed under the Apache License, Version 2.0 (the "License");
8235+ * you may not use this file except in compliance with the License.
8236+ * You may obtain a copy of the License at
8237+ *
8238+ * http://www.apache.org/licenses/LICENSE-2.0
8239+ *
8240+ * Unless required by applicable law or agreed to in writing, software
8241+ * distributed under the License is distributed on an "AS IS" BASIS,
8242+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8243+ * See the License for the specific language governing permissions and
8244+ * limitations under the License.
8245+ */
8246+
8247+#include "src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h"
8248+#include "nnacl/fp32/pack_fp32.h"
8249+
8250+namespace mindspore::kernel {
8251+namespace {
8252+constexpr int kInputGateIndex = 0;
8253+constexpr int kGateNum = 4;
8254+constexpr int kWeightInputIndex = 1;
8255+constexpr int kWeightHiddenindex = 2;
8256+constexpr int kCombinedBiasIndex = 3;
8257+}  // namespace
8258+
8259+int LstmNonMindirFp32CPUKernel::InitInputWeightBias() {
8260+  // malloc and init input * weight right matrix buffer
8261+  // input -- row: seq_len * batch; col: input_size
8262+  // weight -- row: hidden_size; col: input_size, need transpose
8263+  // result -- row: seq_len * batch; col: hidden_size
8264+  weight_i_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
8265+    weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float)));
8266+  MS_CHECK_TRUE_MSG(weight_i_ptr_ != nullptr, lite::RET_NULL_PTR,
8267+                    "LstmNonMindirCPUKernel malloc weight_i_ptr_ failed.");
8268+  running_buffer_.push_back(weight_i_ptr_);
8269+  auto weight_i = in_tensors_.at(kWeightInputIndex);
8270+  auto weight_i_data = reinterpret_cast<float *>(weight_i->data());
8271+  CHECK_NULL_RETURN(weight_i_data);
8272+
8273+  int stride = kGateNum * lstm_param_->input_size_ * lstm_param_->hidden_size_;
8274+  PackLstmWeightWithStride(weight_i_ptr_, weight_i_data, weight_segment_num_, lstm_param_->input_size_,
8275+                           lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_,
8276+                           stride, nullptr);
8277+  // input bias
8278+  input_bias_ = reinterpret_cast<float *>(
8279+    ms_context_->allocator->Malloc(weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float)));
8280+  MS_CHECK_TRUE_MSG(input_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc input_bias_ failed.");
8281+  memset(input_bias_, 0, weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float));
8282+  running_buffer_.push_back(input_bias_);
8283+  auto bias_data = reinterpret_cast<float *>(in_tensors_.at(kCombinedBiasIndex)->data());
8284+  CHECK_NULL_RETURN(bias_data);
8285+  PackLstmBias(input_bias_, bias_data, weight_segment_num_, lstm_param_->hidden_size_, lstm_param_->input_col_align_,
8286+               lstm_param_->bidirectional_, nullptr);
8287+  return RET_OK;
8288+}
8289+
8290+int LstmNonMindirFp32CPUKernel::InitStateWeightBias() {
8291+  // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times.
8292+  // state -- row: batch; col: hidden_size
8293+  // weight -- row: hidden_size; col: hidden_size, need transpose
8294+  // result -- row: batch; col: hidden_size
8295+  auto weight_h = in_tensors_.at(kWeightHiddenindex);
8296+  auto weight_h_data = reinterpret_cast<float *>(weight_h->data());
8297+  CHECK_NULL_RETURN(weight_h_data);
8298+
8299+  int stride = kGateNum * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8300+  auto weight_pack_size =
8301+    weight_segment_num_ * lstm_param_->state_col_align_ * lstm_param_->output_size_ * sizeof(float);
8302+  if (lstm_param_->batch_ != 1) {
8303+    weight_h_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(weight_pack_size));
8304+    MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR,
8305+                      "LstmNonMindirCPUKernel malloc weight_h_ptr_ failed.");
8306+    running_buffer_.push_back(weight_h_ptr_);
8307+    PackLstmWeightWithStride(weight_h_ptr_, weight_h_data, weight_segment_num_, lstm_param_->output_size_,
8308+                             lstm_param_->hidden_size_, lstm_param_->state_col_align_, lstm_param_->bidirectional_,
8309+                             stride, nullptr);
8310+  } else {
8311+#ifdef ENABLE_AVX
8312+    weight_h_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(weight_pack_size));
8313+    MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR,
8314+                      "LstmNonMindirCPUKernel malloc weight_h_ptr_ failed.");
8315+    running_buffer_.push_back(weight_h_ptr_);
8316+    for (int i = 0; i < weight_segment_num_; i++) {
8317+      const float *src_batch = weight_h_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8318+      float *dst_batch = weight_h_ptr_ + i * lstm_param_->state_col_align_ * lstm_param_->output_size_;
8319+      RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->hidden_size_, lstm_param_->output_size_);
8320+    }
8321+#else
8322+    weight_h_ptr_ = weight_h_data;
8323+#endif
8324+  }
8325+
8326+  // state bias
8327+  auto bias_pack_size = weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float);
8328+  state_bias_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(bias_pack_size));
8329+  MS_CHECK_TRUE_MSG(state_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc state_bias_ failed.");
8330+  memset(state_bias_, 0, bias_pack_size);
8331+  running_buffer_.push_back(state_bias_);
8332+  // if ONNX, the second bias is also present, in order IOFG
8333+  auto bias_data = reinterpret_cast<float *>(in_tensors_.at(kCombinedBiasIndex)->data());
8334+  CHECK_NULL_RETURN(bias_data);
8335+  auto *state_bias = bias_data + kGateNum * lstm_param_->hidden_size_;
8336+  PackLstmBias(state_bias_, state_bias, weight_segment_num_, lstm_param_->hidden_size_, lstm_param_->state_col_align_,
8337+               lstm_param_->bidirectional_, nullptr);
8338+  return RET_OK;
8339+}
8340+
8341+int LstmNonMindirFp32CPUKernel::InitProjectWeight() {
8342+  if (in_tensors_.size() < C7NUM) {
8343+    return RET_OK;
8344+  }
8345+  auto weight_pro = in_tensors_.at(SEVENTH_INPUT);
8346+  auto shape = weight_pro->shape();
8347+  MS_CHECK_TRUE_MSG(shape.size() == C3NUM, lite::RET_ERROR, "Project-weight's shape must be 3D.");
8348+  auto weight_pro_data = reinterpret_cast<float *>(weight_pro->data());
8349+  CHECK_NULL_RETURN(weight_pro_data);
8350+  int batch = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
8351+  if (shape[0] != batch) {
8352+    MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true).";
8353+    return lite::RET_ERROR;
8354+  }
8355+  int col_align = UP_ROUND(lstm_param_->output_size_, col_tile_);
8356+  auto pack_size = batch * lstm_param_->hidden_size_ * col_align * sizeof(float);
8357+  if (lstm_param_->batch_ != 1) {
8358+    weight_project_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_size));
8359+    MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR,
8360+                      "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed.");
8361+    running_buffer_.push_back(weight_project_ptr_);
8362+    PackLstmWeightWithStride(weight_project_ptr_, weight_pro_data, batch, lstm_param_->hidden_size_,
8363+                             lstm_param_->output_size_, col_align, lstm_param_->bidirectional_,
8364+                             lstm_param_->hidden_size_ * lstm_param_->output_size_, nullptr);
8365+  } else {
8366+#ifdef ENABLE_AVX
8367+    weight_project_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_size));
8368+    MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR,
8369+                      "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed.");
8370+    running_buffer_.push_back(weight_project_ptr_);
8371+    for (int i = 0; i < batch; ++i) {
8372+      const float *src_batch = weight_pro_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_;
8373+      float *dst_batch = weight_project_ptr_ + i * lstm_param_->hidden_size_ * col_align;
8374+      RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->output_size_, lstm_param_->hidden_size_);
8375+    }
8376+#else
8377+    weight_project_ptr_ = weight_pro_data;
8378+#endif
8379+  }
8380+  return RET_OK;
8381+}
8382+
8383+void LstmNonMindirFp32CPUKernel::LstmUnidirectional(float *output, const float *weight_h, const float *state_bias,
8384+                                                    float *hidden_state, float *cell_state, const float *weight_project,
8385+                                                    float *intermediate_states, float **buffer, bool is_backward) {
8386+  float *gate = buffer[kInputGateIndex];
8387+  float *input_gate = gate;
8388+  float *forget_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C2NUM;
8389+  float *cell_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C3NUM;
8390+  float *output_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_;
8391+  for (int t = 0; t < lstm_param_->seq_len_; t++) {
8392+    int real_t = is_backward ? lstm_param_->seq_len_ - t - C1NUM : t;
8393+    float *input_gate_t = input_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8394+    float *forget_gate_t = forget_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8395+    float *cell_gate_t = cell_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8396+    float *output_gate_t = output_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t;
8397+    // Sequence, DirMul, Batch, Hidden
8398+    float *output_ptr = output + real_t * lstm_param_->output_step_;
8399+    LstmStepUnit(output_ptr, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias,
8400+                 weight_project, hidden_state, cell_state, buffer, lstm_param_);
8401+  }
8402+}
8403+}  // namespace mindspore::kernel
8404diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h
8405new file mode 100644
8406index 00000000..b16e9175
8407--- /dev/null
8408+++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h
8409@@ -0,0 +1,61 @@
8410+/**
8411+ * Copyright 2023 Huawei Technologies Co., Ltd
8412+ *
8413+ * Licensed under the Apache License, Version 2.0 (the "License");
8414+ * you may not use this file except in compliance with the License.
8415+ * You may obtain a copy of the License at
8416+ *
8417+ * http://www.apache.org/licenses/LICENSE-2.0
8418+ *
8419+ * Unless required by applicable law or agreed to in writing, software
8420+ * distributed under the License is distributed on an "AS IS" BASIS,
8421+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8422+ * See the License for the specific language governing permissions and
8423+ * limitations under the License.
8424+ */
8425+
8426+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_NON_MINDIR_FP32_H_
8427+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_NON_MINDIR_FP32_H_
8428+
8429+#include <vector>
8430+#include "src/litert/kernel/cpu/fp32/lstm_fp32_base.h"
8431+
8432+namespace mindspore::kernel {
8433+/*
8434+ * 1. LSTM without project, output_size = hidden_size
8435+ *    weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size]
8436+ *    weight_hh: third input, shape is [bidirectional, 4 * hidden_size, hidden_size]
8437+ *    bias: fourth input, shape is [bidirectional, 8 * hidden_size]
8438+ *    h_init: fifth input, shape is [bidirectional, batch_size, hidden_size]
8439+ *    c_init: sixth input, shape is [bidirectional, batch_size, hidden_size]
8440+ *
8441+ * 2. LSTM with project, output_size = project_size
8442+ *    weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size]
8443+ *    weight_hh: third input, shape is [bidirectional, 4 * hidden_size, project_size]
8444+ *    bias: fourth input, shape is [bidirectional, 8 * hidden_size]
8445+ *    h_init: fifth input, shape is [bidirectional, batch_size, project_size]
8446+ *    c_init: sixth input, shape is [bidirectional, batch_size, hidden_size]
8447+ *    weight_pro: seventh input, shape is [bidirectional, project_size, hidden_size]
8448+ */
8449+class LstmNonMindirFp32CPUKernel : public LstmFp32BaseCPUKernel {
8450+ public:
8451+  LstmNonMindirFp32CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
8452+                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
8453+      : LstmFp32BaseCPUKernel(parameter, inputs, outputs, ctx) {
8454+    hidden_init_index_ = FIFTH_INPUT;
8455+    cell_init_index_ = SIXTH_INPUT;
8456+  }
8457+
8458+  ~LstmNonMindirFp32CPUKernel() override = default;
8459+
8460+ protected:
8461+  int InitInputWeightBias() override;
8462+  int InitStateWeightBias() override;
8463+  int InitProjectWeight() override;
8464+  void LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, float *hidden_state,
8465+                          float *cell_state, const float *weight_project, float *intermediate_states, float *buffer[],
8466+                          bool is_backward) override;
8467+};
8468+}  // namespace mindspore::kernel
8469+
8470+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_NON_MINDIR_FP32_H_
8471diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc
8472new file mode 100644
8473index 00000000..60d3f213
8474--- /dev/null
8475+++ b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc
8476@@ -0,0 +1,147 @@
8477+/**
8478+ * Copyright 2023 Huawei Technologies Co., Ltd
8479+ *
8480+ * Licensed under the Apache License, Version 2.0 (the "License");
8481+ * you may not use this file except in compliance with the License.
8482+ * You may obtain a copy of the License at
8483+ *
8484+ * http://www.apache.org/licenses/LICENSE-2.0
8485+ *
8486+ * Unless required by applicable law or agreed to in writing, software
8487+ * distributed under the License is distributed on an "AS IS" BASIS,
8488+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8489+ * See the License for the specific language governing permissions and
8490+ * limitations under the License.
8491+ */
8492+#include "src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h"
8493+#include "src/litert/kernel_registry.h"
8494+#include "include/errorcode.h"
8495+#include "src/common/log_adapter.h"
8496+#include "nnacl/custom_gather_d_grad_v2_parameter.h"
8497+
8498+using mindspore::lite::KernelRegistrar;
8499+using mindspore::lite::RET_ERROR;
8500+using mindspore::lite::RET_NOT_SUPPORT;
8501+using mindspore::lite::RET_OK;
8502+
8503+namespace mindspore::kernel {
8504+namespace {
8505+constexpr size_t index_idx_{1};
8506+constexpr size_t grad_idx_{2};
8507+size_t get_element_num(const std::vector<int> &shape) {
8508+  return std::accumulate(shape.begin(), shape.end(), static_cast<std::size_t>(1), std::multiplies<int>());
8509+}
8510+
8511+void GatherDGradCopyTask(size_t cur, std::vector<size_t> *pos, float *input, int *index, const int &dim, float *output,
8512+                         const std::vector<int> &output_shape, const std::vector<size_t> &out_cargo_size,
8513+                         const std::vector<size_t> &input_cargo_size) {
8514+  for (int i = 0; i < output_shape[cur]; ++i) {
8515+    (*pos)[cur] = i;
8516+    if (cur == output_shape.size() - 1) {
8517+      int input_offset = 0;
8518+      int out_offset = 0;
8519+      // out offset
8520+      for (size_t j = 0; j < output_shape.size(); ++j) {
8521+        out_offset += (*pos)[j] * out_cargo_size[j];
8522+      }
8523+      // input offset
8524+      int cur_index = (*pos)[dim];
8525+      (*pos)[dim] = index[out_offset];
8526+      for (size_t j = 0; j < output_shape.size(); ++j) {
8527+        input_offset += (*pos)[j] * input_cargo_size[j];
8528+      }
8529+      // do copy
8530+      input[input_offset] += output[out_offset];
8531+      (*pos)[dim] = cur_index;
8532+    } else {
8533+      // CopyTask
8534+      GatherDGradCopyTask(cur + 1, pos, input, index, dim, output, output_shape, out_cargo_size, input_cargo_size);
8535+    }
8536+  }
8537+}
8538+}  // namespace
8539+
8540+CustomGatherDGradV2CPUKernel::~CustomGatherDGradV2CPUKernel() {}
8541+
8542+int CustomGatherDGradV2CPUKernel::Prepare() {
8543+  CHECK_LESS_RETURN(in_tensors_.size(), C3NUM);
8544+  CHECK_LESS_RETURN(out_tensors_.size(), C1NUM);
8545+  if (InitParamter() != RET_OK) {
8546+    MS_LOG(ERROR) << "Init Built-in CustomGatherGradV2 Parameter failed." << name_;
8547+    return RET_ERROR;
8548+  }
8549+  if (!InferShapeDone()) {
8550+    return RET_OK;
8551+  }
8552+  return ReSize();
8553+}
8554+
8555+int CustomGatherDGradV2CPUKernel::InitParamter() {
8556+  auto param = reinterpret_cast<CustomGatherGradV2Parameter *>(op_parameter_);
8557+  axis_ = param->dim;
8558+  return RET_OK;
8559+}
8560+
8561+int CustomGatherDGradV2CPUKernel::ReSize() {
8562+  index_shape_ = in_tensors_[index_idx_]->shape();
8563+  grad_shape_ = in_tensors_[grad_idx_]->shape();
8564+  output_shape_ = out_tensors_[0]->shape();
8565+  if (grad_shape_.size() != index_shape_.size() || output_shape_.size() != index_shape_.size()) {
8566+    MS_LOG(ERROR) << "For '" << name_ << "', the dimension of grad and output must be equal to the "
8567+                  << "dimension of index: " << index_shape_.size()
8568+                  << ", but got the dimension of grad: " << grad_shape_.size()
8569+                  << ", the dimension of output: " << output_shape_.size();
8570+    return RET_ERROR;
8571+  }
8572+
8573+  return RET_OK;
8574+}
8575+
8576+int CustomGatherDGradV2CPUKernel::Run() {
8577+  auto *index = reinterpret_cast<int *>(in_tensors_[index_idx_]->data());
8578+  auto *grad = reinterpret_cast<float *>(in_tensors_[grad_idx_]->data());
8579+  auto out = reinterpret_cast<float *>(out_tensors_[0]->data());
8580+  int output_rank = output_shape_.size();
8581+  if (axis_ >= output_rank || axis_ < -output_rank) {
8582+    MS_LOG(ERROR) << "For '" << name_ << "', the value of 'dim' must be in [" << -output_rank << ", " << output_rank
8583+                  << "), but got: " << axis_;
8584+  }
8585+  if (axis_ < 0) {
8586+    axis_ = axis_ + output_rank;
8587+  }
8588+
8589+  // check index
8590+  size_t index_size = get_element_num(index_shape_);
8591+  int max_index = output_shape_[axis_];
8592+  for (size_t i = 0; i < index_size; ++i) {
8593+    if (index[i] >= max_index || index[i] < -max_index) {
8594+      MS_LOG(ERROR) << "For '" << name_ << "', the value of 'index' must be in [" << -max_index << ", " << max_index
8595+                    << "), but got: " << index[i];
8596+    }
8597+    if (index[i] < 0) {
8598+      index[i] = max_index + index[i];
8599+    }
8600+  }
8601+  auto out_size = get_element_num(output_shape_);
8602+  memset(out, 0, out_size * sizeof(float));
8603+
8604+  // out_cargo_size
8605+  std::vector<size_t> out_cargo_size = std::vector<size_t>(output_shape_.size(), 1);
8606+  for (int i = static_cast<int>(out_cargo_size.size()) - 2; i >= 0; --i) {
8607+    out_cargo_size[i] = output_shape_[i + 1] * out_cargo_size[i + 1];
8608+  }
8609+  // grad_cargo_size
8610+  std::vector<size_t> grad_cargo_size = std::vector<size_t>(grad_shape_.size(), 1);
8611+  for (int i = static_cast<int>(grad_cargo_size.size()) - 2; i >= 0; --i) {
8612+    grad_cargo_size[i] = grad_shape_[i + 1] * grad_cargo_size[i + 1];
8613+  }
8614+
8615+  // copy task
8616+  std::vector<size_t> pos(index_shape_.size(), 0);
8617+  GatherDGradCopyTask(0, &pos, out, index, axis_, grad, index_shape_, grad_cargo_size, out_cargo_size);
8618+  return RET_OK;
8619+}
8620+
8621+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimType_Inner_CustomGatherDGradV2,
8622+           LiteKernelCreator<CustomGatherDGradV2CPUKernel>)
8623+}  // namespace mindspore::kernel
8624diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h
8625new file mode 100644
8626index 00000000..25666023
8627--- /dev/null
8628+++ b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h
8629@@ -0,0 +1,42 @@
8630+/**
8631+ * Copyright 2023 Huawei Technologies Co., Ltd
8632+ *
8633+ * Licensed under the Apache License, Version 2.0 (the "License");
8634+ * you may not use this file except in compliance with the License.
8635+ * You may obtain a copy of the License at
8636+ *
8637+ * http://www.apache.org/licenses/LICENSE-2.0
8638+ *
8639+ * Unless required by applicable law or agreed to in writing, software
8640+ * distributed under the License is distributed on an "AS IS" BASIS,
8641+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8642+ * See the License for the specific language governing permissions and
8643+ * limitations under the License.
8644+ */
8645+
8646+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_GRAD_CUSTOM_GATHER_D_GRAD_V2_H_
8647+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_GRAD_CUSTOM_GATHER_D_GRAD_V2_H_
8648+#include <vector>
8649+#include "src/litert/lite_kernel.h"
8650+
8651+namespace mindspore::kernel {
8652+class CustomGatherDGradV2CPUKernel : public LiteKernel {
8653+ public:
8654+  CustomGatherDGradV2CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
8655+                               const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
8656+      : LiteKernel(parameter, inputs, outputs, ctx) {}
8657+  ~CustomGatherDGradV2CPUKernel() override;
8658+  int Prepare() override;
8659+  int ReSize() override;
8660+  int Run() override;
8661+
8662+ private:
8663+  int InitParamter();
8664+
8665+  std::vector<int> index_shape_;
8666+  std::vector<int> grad_shape_;
8667+  std::vector<int> output_shape_;
8668+  int axis_{0};
8669+};
8670+}  // namespace mindspore::kernel
8671+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_GRAD_CUSTOM_GATHER_D_GRAD_V2_H_
8672diff --git a/mindspore/lite/src/train/graph_fusion.cc b/mindspore/lite/src/train/graph_fusion.cc
8673index 48c037b2..7982f818 100644
8674--- a/mindspore/lite/src/train/graph_fusion.cc
8675+++ b/mindspore/lite/src/train/graph_fusion.cc
8676@@ -25,6 +25,8 @@
8677 #include "src/train/optimizer/fusion/reshape_gather_reshape_fusion_pass.h"
8678 #include "tools/converter/legacy_optimizer/graph/isolated_node_remove_pass.h"
8679 #include "tools/converter/legacy_optimizer/graph/subgraph_node_pass.h"
8680+#include "src/train/optimizer/fusion/matmul_add_fusion_pass.h"
8681+#include "src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h"
8682
8683 namespace mindspore {
8684 namespace lite {
8685@@ -52,7 +54,9 @@ STATUS GraphFusion::Run(schema::MetaGraphT *graph) {
8686   Optimizer fusion_optimizer;
8687   fusion_optimizer.AddPass(new (std::nothrow) ReshapeGatherReshapeFusionPass());
8688   fusion_optimizer.AddPass(new (std::nothrow) MatMulBiasAddFusionPass());
8689+  fusion_optimizer.AddPass(new (std::nothrow) MatMulAddFusionPass());
8690   fusion_optimizer.AddPass(new (std::nothrow) MatMulActivationFusionPass());
8691+  fusion_optimizer.AddPass(new (std::nothrow) MatMulMatMulAddFusionPass());
8692   fusion_optimizer.AddPass(new (std::nothrow) IsolatedNodeRemovePass());
8693   fusion_optimizer.AddPass(new (std::nothrow) SubgraphNodePass(old_nodes));
8694   auto status = fusion_optimizer.Run(graph);
8695diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.cc b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.cc
8696new file mode 100644
8697index 00000000..34bed911
8698--- /dev/null
8699+++ b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.cc
8700@@ -0,0 +1,127 @@
8701+/**
8702+ * Copyright 2023 Huawei Technologies Co., Ltd
8703+ *
8704+ * Licensed under the Apache License, Version 2.0 (the "License");
8705+ * you may not use this file except in compliance with the License.
8706+ * You may obtain a copy of the License at
8707+ *
8708+ * http://www.apache.org/licenses/LICENSE-2.0
8709+ *
8710+ * Unless required by applicable law or agreed to in writing, software
8711+ * distributed under the License is distributed on an "AS IS" BASIS,
8712+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8713+ * See the License for the specific language governing permissions and
8714+ * limitations under the License.
8715+ */
8716+#include "src/train/optimizer/fusion/matmul_add_fusion_pass.h"
8717+#include <string>
8718+#include <unordered_map>
8719+#include <vector>
8720+#include <memory>
8721+#include "schema/inner/model_generated.h"
8722+#include "tools/common/meta_graph_utils.h"
8723+namespace {
8724+constexpr int kNumAddMatchPathLen = 2;
8725+constexpr std::string_view MulName = "MATMUL";
8726+constexpr std::string_view AddName = "ADD";
8727+}  // namespace
8728+namespace mindspore {
8729+namespace lite {
8730+namespace {
8731+int CalNewCnodeBias(const std::unique_ptr<mindspore::schema::TensorT> &add_weight_tensor,
8732+                    const std::unique_ptr<mindspore::schema::TensorT> &matmul_bias_tensor) {
8733+  if (add_weight_tensor->dataType != kNumberTypeFloat32 || matmul_bias_tensor->dataType != kNumberTypeFloat32) {
8734+    MS_LOG(INFO) << "only support float32 data type";
8735+    return RET_ERROR;
8736+  }
8737+  std::vector<int32_t> matmul_bias_shape = matmul_bias_tensor->dims;
8738+  std::vector<int32_t> add_weight_shape = add_weight_tensor->dims;
8739+  MS_CHECK_TRUE_RET(matmul_bias_shape == add_weight_shape, RET_ERROR);
8740+  auto add_weight_data = reinterpret_cast<float *>(add_weight_tensor->data.data());
8741+  auto matmul_bias_data = reinterpret_cast<float *>(matmul_bias_tensor->data.data());
8742+  int num = static_cast<int>(matmul_bias_tensor->data.size() / sizeof(float));
8743+  for (int i = 0; i < num; ++i) {
8744+    matmul_bias_data[i] += add_weight_data[i];
8745+  }
8746+  return RET_OK;
8747+}
8748+}  // namespace
8749+STATUS MatMulAddFusionPass::Run(MetaGraphT *graph) { return FusionPass::Run(graph); }
8750+STATUS MatMulAddFusionPass::DefinePattern() {
8751+  auto mul_op = std::make_shared<PatternOp>();
8752+  MS_CHECK_TRUE_RET(mul_op != nullptr, RET_NULL_PTR);
8753+  mul_op->id = MulName;
8754+  mul_op->types = {schema::PrimitiveType_MatMulFusion};
8755+  auto add_op = std::make_shared<PatternOp>();
8756+  MS_CHECK_TRUE_RET(add_op != nullptr, RET_NULL_PTR);
8757+  add_op->id = AddName;
8758+  add_op->types = {schema::PrimitiveType_AddFusion};
8759+  add_op->left = mul_op;
8760+  std::unique_ptr<FusionPattern> fusion_pattern(new (std::nothrow) FusionPattern("MatMulAddFusion"));
8761+  if (fusion_pattern == nullptr) {
8762+    MS_LOG(ERROR) << "new fusion_pattern failed";
8763+    return RET_ERROR;
8764+  }
8765+  fusion_pattern->AddPatternOp(mul_op);
8766+  fusion_pattern->AddPatternOp(add_op);
8767+  fusion_pattern->Finish();
8768+  this->patterns.emplace_back(fusion_pattern.release());
8769+  return RET_OK;
8770+}
8771+STATUS MatMulAddFusionPass::DoFusion(MetaGraphT *graph, const std::string &pattern_name,
8772+                                     const std::unordered_map<std::string, std::shared_ptr<Path>> &matched_path) {
8773+  MS_CHECK_TRUE_RET(graph != nullptr, RET_NULL_PTR);
8774+  if (matched_path.size() != kNumAddMatchPathLen) {
8775+    MS_LOG(ERROR) << "MatMul-Add-Fusion should have two NodeIndex in matchedPair";
8776+    return RET_PARAM_INVALID;
8777+  }
8778+  auto mul_path_iter = matched_path.find(std::string(MulName));
8779+  MS_CHECK_TRUE_RET(mul_path_iter != matched_path.end(), RET_NO_CHANGE);
8780+  auto &mul_path = mul_path_iter->second;
8781+  MS_CHECK_TRUE_RET(mul_path != nullptr, RET_NULL_PTR);
8782+  auto add_path_iter = matched_path.find(std::string(AddName));
8783+  MS_CHECK_TRUE_RET(add_path_iter != matched_path.end(), RET_NO_CHANGE);
8784+  auto &add_path = add_path_iter->second;
8785+  MS_CHECK_TRUE_RET(add_path != nullptr, RET_NULL_PTR);
8786+  auto mul_index = mul_path->nodeIdx;
8787+  auto add_index = add_path->nodeIdx;
8788+  auto &mul_node = graph->nodes.at(mul_index);
8789+  MS_CHECK_TRUE_RET(mul_node != nullptr, RET_NULL_PTR);
8790+  auto &add_node = graph->nodes.at(add_index);
8791+  MS_CHECK_TRUE_RET(add_node != nullptr, RET_NULL_PTR);
8792+  if (mul_node->quantType == schema::QuantType_QUANT_ALL || mul_node->quantType == schema::QuantType_QUANT_DYNAMIC ||
8793+      add_node->quantType == schema::QuantType_QUANT_ALL || add_node->quantType == schema::QuantType_QUANT_DYNAMIC) {
8794+    MS_LOG(DEBUG) << "cannot fusion.";
8795+    return RET_NO_CHANGE;
8796+  }
8797+  MS_CHECK_TRUE_RET(mul_node->primitive != nullptr, RET_NULL_PTR);
8798+  auto matmul_type = mul_node->primitive->value.AsMatMulFusion();
8799+  MS_CHECK_TRUE_RET(matmul_type->activation_type == ActivationType::ActivationType_NO_ACTIVATION, RET_NO_CHANGE);
8800+  auto add_param_shape = graph->allTensors.at(add_node->inputIndex.at(SECOND_INPUT))->dims;
8801+  MS_CHECK_TRUE_MSG(add_param_shape.size() == DIMENSION_1D, RET_NO_CHANGE, "only support bias with shape size of 1.");
8802+  if (mul_node->inputIndex.size() == C3NUM) {
8803+    auto &mul_bias_tensor = graph->allTensors.at(mul_node->inputIndex.at(THIRD_INPUT));
8804+    if (mul_bias_tensor->data.data() == nullptr) {
8805+      MS_LOG(INFO) << mul_node->name << "'s bias is not const";
8806+      return RET_NO_CHANGE;
8807+    }
8808+    auto &add_weight_tensor = graph->allTensors.at(add_node->inputIndex.at(SECOND_INPUT));
8809+    if (CalNewCnodeBias(add_weight_tensor, mul_bias_tensor) != RET_OK) {
8810+      MS_LOG(INFO) << add_node->name << " failed to fusion with " << mul_node->name;
8811+      return RET_NO_CHANGE;
8812+    }
8813+  }
8814+  auto add_tensor_index = add_node->inputIndex.at(SECOND_INPUT);
8815+  if (mul_node->inputIndex.size() == C2NUM) {
8816+    mul_node->inputIndex.push_back(add_tensor_index);
8817+  }
8818+  mul_node->outputIndex = {add_node->outputIndex};
8819+  // cannot delete node here, otherwise will destroy order in other pattern's node index
8820+  // make it an isolated node to be removed in IsolatedNodeRemovePass
8821+  add_node->inputIndex.clear();
8822+  add_node->outputIndex.clear();
8823+  return RET_OK;
8824+}
8825+MatMulAddFusionPass::~MatMulAddFusionPass() = default;
8826+}  // namespace lite
8827+}  // namespace mindspore
8828diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.h b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.h
8829new file mode 100644
8830index 00000000..8eb4ab2e
8831--- /dev/null
8832+++ b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.h
8833@@ -0,0 +1,37 @@
8834+/**
8835+ * Copyright 2023 Huawei Technologies Co., Ltd
8836+ *
8837+ * Licensed under the Apache License, Version 2.0 (the "License");
8838+ * you may not use this file except in compliance with the License.
8839+ * You may obtain a copy of the License at
8840+ *
8841+ * http://www.apache.org/licenses/LICENSE-2.0
8842+ *
8843+ * Unless required by applicable law or agreed to in writing, software
8844+ * distributed under the License is distributed on an "AS IS" BASIS,
8845+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8846+ * See the License for the specific language governing permissions and
8847+ * limitations under the License.
8848+ */
8849+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_LEGACY_OPTIMIZER_FUSION_MATMUL_ADD_FUSION_PASS_H_
8850+#define MINDSPORE_LITE_TOOLS_CONVERTER_LEGACY_OPTIMIZER_FUSION_MATMUL_ADD_FUSION_PASS_H_
8851+#include <string>
8852+#include <unordered_map>
8853+#include <memory>
8854+#include <algorithm>
8855+#include <utility>
8856+#include "tools/converter/legacy_optimizer/fusion/fusion_pass.h"
8857+namespace mindspore {
8858+namespace lite {
8859+class MatMulAddFusionPass : public FusionPass {
8860+ public:
8861+  MatMulAddFusionPass() = default;
8862+  ~MatMulAddFusionPass() override;
8863+  STATUS DefinePattern() override;
8864+  STATUS DoFusion(MetaGraphT *graph, const std::string &pattern_name,
8865+                  const std::unordered_map<std::string, std::shared_ptr<Path>> &matched_path) override;
8866+  STATUS Run(MetaGraphT *graph) override;
8867+};
8868+}  // namespace lite
8869+}  // namespace mindspore
8870+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_LEGACY_OPTIMIZER_FUSION_MATMUL_ADD_FUSION_PASS_H_
8871diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc
8872new file mode 100644
8873index 00000000..d1a63c2d
8874--- /dev/null
8875+++ b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc
8876@@ -0,0 +1,163 @@
8877+/**
8878+ * Copyright 2023 Huawei Technologies Co., Ltd
8879+ *
8880+ * Licensed under the Apache License, Version 2.0 (the "License");
8881+ * you may not use this file except in compliance with the License.
8882+ * You may obtain a copy of the License at
8883+ *
8884+ * http://www.apache.org/licenses/LICENSE-2.0
8885+ *
8886+ * Unless required by applicable law or agreed to in writing, software
8887+ * distributed under the License is distributed on an "AS IS" BASIS,
8888+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8889+ * See the License for the specific language governing permissions and
8890+ * limitations under the License.
8891+ */
8892+
8893+#include "src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h"
8894+#include <string>
8895+#include <unordered_map>
8896+#include <vector>
8897+#include <memory>
8898+#include "schema/inner/model_generated.h"
8899+#include "tools/common/meta_graph_utils.h"
8900+#include "src/train/optimizer/common/fusion_utils.h"
8901+namespace {
8902+constexpr std::string_view kFirstMatMulName = "MATMUL1";
8903+constexpr std::string_view kSecondMatMulName = "MATMUL2";
8904+constexpr std::string_view kAddName = "ADD";
8905+}  // namespace
8906+namespace mindspore {
8907+namespace lite {
8908+/*
8909+ * The subgraph such as the following.
8910+ *        any                any
8911+ *       /   \                |
8912+ *   matmul  matmul         matmul
8913+ *       \   /       ---->    |
8914+ *        add                any
8915+ *         |
8916+ *        any
8917+ */
8918+namespace {
8919+int CalNewMatMulNode(MetaGraphT *graph, const std::unique_ptr<mindspore::schema::CNodeT> &matmul_node1,
8920+                     const std::unique_ptr<mindspore::schema::CNodeT> &matmul_node2) {
8921+  auto &matrix_b_1 = graph->allTensors.at(matmul_node1->inputIndex.at(opt::kInputIndexOne));
8922+  auto &matrix_b_2 = graph->allTensors.at(matmul_node2->inputIndex.at(opt::kInputIndexOne));
8923+  if (matrix_b_1->dims != matrix_b_2->dims) {
8924+    MS_LOG(INFO) << "currently, matmul fusion only support the same shape tensor";
8925+    return RET_ERROR;
8926+  }
8927+  if (matrix_b_1->dataType != kNumberTypeFloat32 || matrix_b_2->dataType != kNumberTypeFloat32) {
8928+    MS_LOG(INFO) << "only support float32 data type";
8929+    return RET_ERROR;
8930+  }
8931+  auto matrix_b_1_data = reinterpret_cast<float *>(matrix_b_1->data.data());
8932+  auto matrix_b_2_data = reinterpret_cast<float *>(matrix_b_2->data.data());
8933+  int num_b = static_cast<int>(matrix_b_1->data.size() / sizeof(float));
8934+  for (int j = 0; j < num_b; ++j) {
8935+    matrix_b_1_data[j] += matrix_b_2_data[j];
8936+  }
8937+  return RET_OK;
8938+}
8939+}  // namespace
8940+STATUS MatMulMatMulAddFusionPass::DefinePattern() {
8941+  auto matmul_op1 = std::make_shared<PatternOp>();
8942+  MS_CHECK_TRUE_RET(matmul_op1 != nullptr, RET_NULL_PTR);
8943+  matmul_op1->id = kFirstMatMulName;
8944+  matmul_op1->types = {schema::PrimitiveType_MatMulFusion};
8945+  auto matmul_op2 = std::make_shared<PatternOp>();
8946+  MS_CHECK_TRUE_RET(matmul_op2 != nullptr, RET_NULL_PTR);
8947+  matmul_op2->id = kSecondMatMulName;
8948+  matmul_op2->types = {schema::PrimitiveType_MatMulFusion};
8949+  auto add_op = std::make_shared<PatternOp>();
8950+  MS_CHECK_TRUE_RET(add_op != nullptr, RET_NULL_PTR);
8951+  add_op->id = kAddName;
8952+  add_op->types = {schema::PrimitiveType_AddFusion};
8953+  add_op->left = matmul_op1;
8954+  add_op->right = matmul_op2;
8955+  auto fusion_pattern = std::make_unique<FusionPattern>("MatMulMatMulAddFusion");
8956+  MS_CHECK_TRUE_MSG(fusion_pattern != nullptr, RET_NULL_PTR, "new fusion_pattern failed");
8957+  fusion_pattern->AddPatternOp(matmul_op1);
8958+  fusion_pattern->AddPatternOp(matmul_op2);
8959+  fusion_pattern->AddPatternOp(add_op);
8960+  fusion_pattern->Finish();
8961+  this->patterns.emplace_back(fusion_pattern.release());
8962+  return RET_OK;
8963+}
8964+
8965+STATUS MatMulMatMulAddFusionPass::DoFusion(MetaGraphT *graph, const std::string &pattern_name,
8966+                                           const std::unordered_map<std::string, std::shared_ptr<Path>> &matched_path) {
8967+  MS_CHECK_TRUE_RET(graph != nullptr, RET_NULL_PTR);
8968+  if (matched_path.size() != opt::kMatchPathLenThree) {
8969+    MS_LOG(INFO) << "MatMul-MatMul-Add-Fusion should have three NodeIndex in matchedPair";
8970+    return RET_PARAM_INVALID;
8971+  }
8972+
8973+  size_t matmul_index1 = 0;
8974+  auto ret = opt::GetMatchNodeIndex(graph, matched_path, std::string(kFirstMatMulName), &matmul_index1);
8975+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "cannot get matmul_index1");
8976+  auto &matmul_node1 = graph->nodes.at(matmul_index1);
8977+  MS_CHECK_TRUE_MSG(matmul_node1 != nullptr, RET_NULL_PTR, "matmul_node1 is nullptr");
8978+  size_t matmul_index2 = 0;
8979+  ret = opt::GetMatchNodeIndex(graph, matched_path, std::string(kSecondMatMulName), &matmul_index2);
8980+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "cannot get matmul_index2");
8981+  auto &matmul_node2 = graph->nodes.at(matmul_index2);
8982+  MS_CHECK_TRUE_MSG(matmul_node2 != nullptr, RET_NULL_PTR, "matmul_node2 is nullptr");
8983+  MS_CHECK_TRUE_MSG(matmul_node1->inputIndex.size() > C1NUM && matmul_node2->inputIndex.size() > C1NUM,
8984+                    RET_PARAM_INVALID, "matmul should have two input at least");
8985+  if (matmul_node1->inputIndex.size() < matmul_node2->inputIndex.size()) {
8986+    matmul_node1.swap(matmul_node2);
8987+  }
8988+  size_t add_index = 0;
8989+  ret = opt::GetMatchNodeIndex(graph, matched_path, std::string(kAddName), &add_index);
8990+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "cannot get add_index");
8991+  auto &add_node = graph->nodes.at(add_index);
8992+  MS_CHECK_TRUE_MSG(add_node != nullptr, RET_NULL_PTR, "add_node is nullptr");
8993+
8994+  if (matmul_node1->quantType == schema::QuantType_QUANT_ALL ||
8995+      matmul_node1->quantType == schema::QuantType_QUANT_DYNAMIC ||
8996+      matmul_node2->quantType == schema::QuantType_QUANT_ALL ||
8997+      matmul_node2->quantType == schema::QuantType_QUANT_DYNAMIC ||
8998+      add_node->quantType == schema::QuantType_QUANT_ALL || add_node->quantType == schema::QuantType_QUANT_DYNAMIC) {
8999+    MS_LOG(DEBUG) << "cannot fusion with quant node";
9000+    return RET_NO_CHANGE;
9001+  }
9002+  MS_CHECK_TRUE_RET(matmul_node1->primitive != nullptr, RET_NULL_PTR);
9003+  auto matmul_type1 = matmul_node1->primitive->value.AsMatMulFusion()->activation_type;
9004+  MS_CHECK_TRUE_RET(matmul_node2->primitive != nullptr, RET_NULL_PTR);
9005+  auto matmul_type2 = matmul_node2->primitive->value.AsMatMulFusion()->activation_type;
9006+  MS_CHECK_TRUE_RET(add_node->primitive != nullptr, RET_NULL_PTR);
9007+  auto add_type = add_node->primitive->value.AsAddFusion()->activation_type;
9008+  MS_CHECK_TRUE_RET(matmul_type1 == ActivationType::ActivationType_NO_ACTIVATION &&
9009+                      matmul_type2 == ActivationType::ActivationType_NO_ACTIVATION &&
9010+                      add_type == ActivationType::ActivationType_NO_ACTIVATION,
9011+                    RET_NO_CHANGE);
9012+
9013+  if (matmul_node1->inputIndex.at(FIRST_INPUT) != matmul_node2->inputIndex.at(FIRST_INPUT)) {
9014+    MS_LOG(INFO) << "matmul should have the same first input";
9015+    return RET_NO_CHANGE;
9016+  }
9017+  auto &matmul_left_b = graph->allTensors[matmul_node1->inputIndex.at(SECOND_INPUT)];
9018+  auto &matmul_right_b = graph->allTensors[matmul_node2->inputIndex.at(SECOND_INPUT)];
9019+  if (matmul_left_b->data.empty() || matmul_right_b->data.empty()) {
9020+    return RET_NO_CHANGE;
9021+  }
9022+  if (CalNewMatMulNode(graph, matmul_node1, matmul_node2) != RET_OK) {
9023+    MS_LOG(INFO) << "failed to fusion two matmul";
9024+    return RET_NO_CHANGE;
9025+  }
9026+
9027+  matmul_node1->outputIndex = {add_node->outputIndex};
9028+  // cannot delete node here, otherwise will destroy order in other pattern's node index
9029+  // make it an isolated node to be removed in IsolatedNodeRemovePass
9030+  matmul_node2->inputIndex.clear();
9031+  matmul_node2->outputIndex.clear();
9032+  add_node->inputIndex.clear();
9033+  add_node->outputIndex.clear();
9034+  return RET_OK;
9035+}
9036+
// Defaulted out of line to match the `~MatMulMatMulAddFusionPass() override;`
// declaration in matmul_matmul_add_fusion_pass.h.
MatMulMatMulAddFusionPass::~MatMulMatMulAddFusionPass() = default;
9038+}  // namespace lite
9039+}  // namespace mindspore
9040diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h
9041new file mode 100644
9042index 00000000..9ee6d711
9043--- /dev/null
9044+++ b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h
9045@@ -0,0 +1,43 @@
9046+/**
9047+ * Copyright 2023 Huawei Technologies Co., Ltd
9048+ *
9049+ * Licensed under the Apache License, Version 2.0 (the "License");
9050+ * you may not use this file except in compliance with the License.
9051+ * You may obtain a copy of the License at
9052+ *
9053+ * http://www.apache.org/licenses/LICENSE-2.0
9054+ *
9055+ * Unless required by applicable law or agreed to in writing, software
9056+ * distributed under the License is distributed on an "AS IS" BASIS,
9057+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9058+ * See the License for the specific language governing permissions and
9059+ * limitations under the License.
9060+ */
9061+
9062+#ifndef MINDSPORE_LITE_SRC_TRAIN_OPTIMIZER_FUSION_MATMUL_MATMUL_ADD_FUSION_PASS_H_
9063+#define MINDSPORE_LITE_SRC_TRAIN_OPTIMIZER_FUSION_MATMUL_MATMUL_ADD_FUSION_PASS_H_
9064+
9065+#include <string>
9066+#include <unordered_map>
9067+#include <memory>
9068+#include <algorithm>
9069+#include <utility>
9070+#include "tools/converter/legacy_optimizer/fusion/fusion_pass.h"
9071+
9072+namespace mindspore {
9073+namespace lite {
// Legacy-optimizer fusion pass that rewrites two MatMul nodes sharing the same
// first input and joined by an Add node into a single MatMul
// (A*B1 + A*B2 -> A*(B1+B2)). See matmul_matmul_add_fusion_pass.cc.
class MatMulMatMulAddFusionPass : public FusionPass {
 public:
  MatMulMatMulAddFusionPass() = default;

  ~MatMulMatMulAddFusionPass() override;

  // Registers the MatMul/MatMul/Add subgraph pattern with the fusion engine.
  STATUS DefinePattern() override;

  // Rewrites one matched subgraph in `graph`; the replaced nodes are left
  // isolated for IsolatedNodeRemovePass to delete.
  STATUS DoFusion(MetaGraphT *graph, const std::string &pattern_name,
                  const std::unordered_map<std::string, std::shared_ptr<Path>> &matched_path) override;
};
9085+}  // namespace lite
9086+}  // namespace mindspore
9087+
9088+#endif  // MINDSPORE_LITE_SRC_TRAIN_OPTIMIZER_FUSION_MATMUL_MATMUL_ADD_FUSION_PASS_H_
9089diff --git a/mindspore/lite/src/train/train_export.cc b/mindspore/lite/src/train/train_export.cc
9090index 7534ed2f..5bace006 100644
9091--- a/mindspore/lite/src/train/train_export.cc
9092+++ b/mindspore/lite/src/train/train_export.cc
9093@@ -151,11 +151,18 @@ int TrainExport::QuantTensorData(schema::TensorT *dest_tensor, const lite::Tenso
9094   return RET_OK;
9095 }
9096
9097-std::unique_ptr<schema::TensorT> TrainExport::CreateTensor(const mindspore::lite::Tensor *tensor,
9098-                                                           schema::Tensor *scTensor, int preferred_dim,
9099-                                                           const int tensor_quant_type) {
9100+std::unique_ptr<schema::TensorT> TrainExport::CreateTensor(
9101+  const mindspore::lite::Tensor *tensor, const std::vector<mindspore::lite::Tensor *> const_folded_output,
9102+  schema::Tensor *scTensor, int preferred_dim, const int tensor_quant_type) {
9103   auto tensorT = std::make_unique<schema::TensorT>();
9104-  tensorT->nodeType = scTensor->nodeType();
9105+  bool const_fold = false;
9106+  if (quant_type_ == QT_NONE && !const_folded_output.empty() &&
9107+      std::find(const_folded_output.begin(), const_folded_output.end(), tensor) != const_folded_output.end()) {
9108+    tensorT->nodeType = NodeType_ValueNode;
9109+    const_fold = true;
9110+  } else {
9111+    tensorT->nodeType = scTensor->nodeType();
9112+  }
9113   tensorT->dims = tensor->shape();
9114   tensorT->format = static_cast<schema::Format>(tensor->format());
9115   tensorT->name = tensor->tensor_name();
9116@@ -163,7 +170,8 @@ std::unique_ptr<schema::TensorT> TrainExport::CreateTensor(const mindspore::lite
9117   tensorT->offset = 0;
9118   tensorT->dataType = tensor->data_type();
9119   tensorT->enableHuffmanCode = false;
9120-  if ((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) {
9121+  if (((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) ||
9122+      const_fold) {
9123     if (NeedQuantization(tensor, tensor_quant_type)) {
9124       auto ret = QuantTensorData(tensorT.get(), tensor, preferred_dim);
9125       if (ret != RET_OK) {
9126@@ -392,6 +400,7 @@ int TrainExport::KeepGraphInputsInOrder(const Model *model) {
9127   return RET_OK;
9128 }
9129 int TrainExport::ExportTensor(const Model *model, const std::vector<mindspore::lite::Tensor *> &tensors, int offset,
9130+                              const std::vector<mindspore::lite::Tensor *> const_folded_output,
9131                               const std::vector<std::pair<size_t, tensor_info>> &map_index,
9132                               const std::vector<std::string> &output_names, const std::set<size_t> &out_set) {
9133   std::vector<mindspore::lite::Tensor *> in_tensors;
9134@@ -401,6 +410,7 @@ int TrainExport::ExportTensor(const Model *model, const std::vector<mindspore::l
9135     mindspore::lite::Tensor *tensor = tensors.at(pid);
9136     in_tensors.push_back(tensor);
9137   }
9138+  std::map<std::string, uint32_t> ordered_output_names;
9139   for (auto index : map_index) {
9140     auto id = index.first;
9141     size_t pid = id - static_cast<size_t>(offset);
9142@@ -408,7 +418,8 @@ int TrainExport::ExportTensor(const Model *model, const std::vector<mindspore::l
9143     schema::Tensor *scTensor = model->graph_.all_tensors_.at(pid);
9144     auto preferred_dim = WeightDecoder::GetPreferredDim(in_tensors, index.second.op_parameter, index.second.input_index,
9145                                                         tensor->shape(), model->graph_.version_);
9146-    auto tensorT = CreateTensor(tensor, scTensor, preferred_dim, index.second.op_parameter->quant_type_);
9147+    auto tensorT =
9148+      CreateTensor(tensor, const_folded_output, scTensor, preferred_dim, index.second.op_parameter->quant_type_);
9149     if (tensorT == nullptr) {
9150       MS_LOG(ERROR) << "error in tensor creation";
9151       return RET_ERROR;
9152@@ -423,21 +434,27 @@ int TrainExport::ExportTensor(const Model *model, const std::vector<mindspore::l
9153     }
9154     // find output tensor
9155     if (std::find(output_names.begin(), output_names.end(), tensor->tensor_name()) != output_names.end()) {
9156-      meta_graph_->outputIndex.push_back(remap_[id]);
9157-      if (!meta_graph_->subGraph.empty()) {
9158-        meta_graph_->subGraph[0]->outputIndices.push_back(remap_[id]);
9159-      }
9160+      ordered_output_names[tensor->tensor_name()] = remap_[id];
9161     }
9162     meta_graph_->allTensors.emplace_back(std::move(tensorT));
9163     if (!meta_graph_->subGraph.empty()) {
9164       meta_graph_->subGraph[0]->tensorIndices.push_back(meta_graph_->allTensors.size() - 1);
9165     }
9166   }
9167+  for (auto &output_name : output_names) {
9168+    if (ordered_output_names.find(output_name) != ordered_output_names.end()) {
9169+      meta_graph_->outputIndex.push_back(ordered_output_names[output_name]);
9170+      if (!meta_graph_->subGraph.empty()) {
9171+        meta_graph_->subGraph[0]->outputIndices.push_back(ordered_output_names[output_name]);
9172+      }
9173+    }
9174+  }
9175   return RET_OK;
9176 }
9177
9178 int TrainExport::ExportNet(const std::vector<mindspore::kernel::KernelExec *> &kernels,
9179                            const std::vector<mindspore::lite::Tensor *> &tensors,
9180+                           const std::vector<mindspore::lite::Tensor *> const_folded_output,
9181                            const std::vector<std::string> &output_names, const Model *model,
9182                            QuantizationType quant_type, const Model *bb_model) {
9183   std::vector<std::pair<size_t, tensor_info>> map_index;
9184@@ -498,7 +515,7 @@ int TrainExport::ExportNet(const std::vector<mindspore::kernel::KernelExec *> &k
9185     }
9186   }
9187
9188-  auto status = ExportTensor(model, tensors, offset, map_index, output_names, out_set);
9189+  auto status = ExportTensor(model, tensors, offset, const_folded_output, map_index, output_names, out_set);
9190   if (status != RET_OK) {
9191     MS_LOG(ERROR) << "ExportTensor failed.";
9192     return RET_ERROR;
9193diff --git a/mindspore/lite/src/train/train_export.h b/mindspore/lite/src/train/train_export.h
9194index b44f6526..8428c9b9 100644
9195--- a/mindspore/lite/src/train/train_export.h
9196+++ b/mindspore/lite/src/train/train_export.h
9197@@ -47,8 +47,10 @@ class TrainExport {
9198   explicit TrainExport(Buffer *model_buffer) : model_buffer_(model_buffer) {}
9199   virtual ~TrainExport();
9200   int ExportNet(const std::vector<mindspore::kernel::KernelExec *> &kernels,
9201-                const std::vector<mindspore::lite::Tensor *> &tensors, const std::vector<std::string> &output_names,
9202-                const Model *model, QuantizationType quant_type, const Model *bb_model = nullptr);
9203+                const std::vector<mindspore::lite::Tensor *> &tensors,
9204+                const std::vector<mindspore::lite::Tensor *> const_folded_output,
9205+                const std::vector<std::string> &output_names, const Model *model, QuantizationType quant_type,
9206+                const Model *bb_model = nullptr);
9207   int ExportInit(const std::string model_name, std::string version);
9208   int SaveToFile();
9209   int SaveToBuffer();
9210@@ -75,7 +77,9 @@ class TrainExport {
9211   int TopologicalSort();
9212   void PrepareRemap(int offset);
9213   LiteGraph::Node *FindNode(const mindspore::kernel::KernelExec *kernel, const Model *model);
9214-  std::unique_ptr<schema::TensorT> CreateTensor(const Tensor *tensor, schema::Tensor *scTensor, int preferred_dim,
9215+  std::unique_ptr<schema::TensorT> CreateTensor(const Tensor *tensor,
9216+                                                const std::vector<mindspore::lite::Tensor *> const_folded_output,
9217+                                                schema::Tensor *scTensor, int preferred_dim,
9218                                                 const int tensor_quant_type);
9219   std::unique_ptr<schema::CNodeT> CreateCNode(const mindspore::kernel::KernelExec *kernel,
9220                                               std::vector<uint32_t> inputIndex, std::vector<uint32_t> outputIndex,
9221@@ -93,6 +97,7 @@ class TrainExport {
9222                              size_t *target_index);
9223   int KeepGraphInputsInOrder(const Model *model);
9224   int ExportTensor(const Model *model, const std::vector<mindspore::lite::Tensor *> &tensors, int offset,
9225+                   const std::vector<mindspore::lite::Tensor *> const_folded_output,
9226                    const std::vector<std::pair<size_t, tensor_info>> &map_index,
9227                    const std::vector<std::string> &output_names, const std::set<size_t> &out_set);
9228   virtual int QuantTensorData(schema::TensorT *dest_tensor, const mindspore::lite::Tensor *src_tensor,
9229diff --git a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc
9230index b581b389..c123cba8 100644
9231--- a/mindspore/lite/src/train/train_session.cc
9232+++ b/mindspore/lite/src/train/train_session.cc
9233@@ -399,6 +399,8 @@ int TrainSession::CompileTrainGraph(std::shared_ptr<Model> model) {
9234     MS_LOG(ERROR) << "failed to allocate space";
9235     return RET_ERROR;
9236   }
9237+  // Prepare a list of kernels which are const folded
9238+  MS_CHECK_TRUE_MSG(CompileConstFoldedKernels() == RET_OK, RET_ERROR, "CompileConstFoldedKernels failed.");
9239   return RET_OK;
9240 }
9241
9242@@ -697,20 +699,30 @@ void TrainSession::CompileEvalOutputs() {
9243         }
9244         if (is_loss) continue;
9245         // insert if not already in
9246-        if (eval_output_node_map_.find(in_kernel->name()) == eval_output_node_map_.end()) {
9247-          auto *ms_tensor = in_kernel->out_tensors().at(0);
9248-          if (ms_tensor != nullptr) {
9249-            ms_tensor->set_init_ref_count(ms_tensor->init_ref_count() + 1);
9250-            eval_output_node_map_[in_kernel->name()].emplace_back(ms_tensor);
9251-            auto index = TSFindTensor(tensors_, ms_tensor);
9252-            if (index != tensors_.size()) {
9253-              if (!ms_tensor->tensor_name().empty()) {
9254-                eval_output_tensor_map_.insert(std::make_pair(ms_tensor->tensor_name(), ms_tensor));
9255-                eval_output_tensor_names_.emplace_back(ms_tensor->tensor_name());
9256-              } else {
9257-                eval_output_tensor_map_.insert(std::make_pair(std::to_string(index), ms_tensor));
9258-                eval_output_tensor_names_.emplace_back(std::to_string(index));
9259-              }
9260+        auto out_tensors = TSFindTensors(in_kernel, kernel);
9261+        if (eval_output_node_map_.find(in_kernel->name()) != eval_output_node_map_.end()) {
9262+          auto exist_out_tensors = eval_output_node_map_[in_kernel->name()];
9263+          std::vector<Tensor *> all_out_tensors;
9264+          auto kernel_all_out_tensors = in_kernel->out_tensors();
9265+          eval_output_node_map_[in_kernel->name()] = {};
9266+          for (auto tensor : kernel_all_out_tensors) {
9267+            if (std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end() ||
9268+                std::find(exist_out_tensors.begin(), exist_out_tensors.end(), tensor) != exist_out_tensors.end()) {
9269+              eval_output_node_map_[in_kernel->name()].emplace_back(tensor);
9270+            }
9271+          }
9272+        } else {
9273+          eval_output_node_map_[in_kernel->name()] = out_tensors;
9274+        }
9275+        for (auto out_tensor : out_tensors) {
9276+          auto index = TSFindTensor(tensors_, out_tensor);
9277+          if (index != tensors_.size()) {
9278+            if (!out_tensor->tensor_name().empty()) {
9279+              eval_output_tensor_map_.insert(std::make_pair(out_tensor->tensor_name(), out_tensor));
9280+              eval_output_tensor_names_.emplace_back(out_tensor->tensor_name());
9281+            } else {
9282+              eval_output_tensor_map_.insert(std::make_pair(std::to_string(index), out_tensor));
9283+              eval_output_tensor_names_.emplace_back(std::to_string(index));
9284             }
9285           }
9286         }
9287@@ -863,6 +875,35 @@ void TrainSession::CompileOptimizedKernels() {
9288   }
9289 }
9290
9291+int TrainSession::CompileConstFoldedKernels() {
9292+  const_output_tensors_.clear();
9293+  for (auto kernel : this->inference_kernels_) {
9294+    bool is_input_const = true;
9295+    for (auto input : kernel->in_tensors()) {
9296+      if ((!input->IsConst() || input->IsGraphInput()) &&
9297+          std::find(const_output_tensors_.begin(), const_output_tensors_.end(), input) == const_output_tensors_.end()) {
9298+        is_input_const = false;
9299+      }
9300+      if (!is_input_const) {
9301+        const_fold_kernels_.emplace_back(kernel);
9302+        break;
9303+      }
9304+    }
9305+    if (is_input_const) {
9306+      auto ret = kernel->Execute();
9307+      if (RET_OK != ret) {
9308+        MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
9309+        return ret;
9310+      }
9311+      for (auto output : kernel->out_tensors()) {
9312+        const_output_tensors_.emplace_back(output);
9313+        output->set_category(Category::CONST_TENSOR);
9314+      }
9315+    }
9316+  }
9317+  return RET_OK;
9318+}
9319+
9320 void TrainSession::CompileTrainableParams() {
9321   for (auto kernel : this->train_kernels_) {
9322     if (!IsOptimizer(kernel)) {
9323@@ -1214,9 +1255,10 @@ int TrainSession::ExportByDifferentType(DestType destination, ModelType model_ty
9324   TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to init export");
9325   if (!output_tensor_name.empty() && model_type == MT_INFERENCE) {
9326     std::vector<kernel::KernelExec *> export_kernels = {};
9327-    status = FindExportKernels(&export_kernels, output_tensor_name, inference_kernels_);
9328+    status = FindExportKernels(&export_kernels, output_tensor_name, const_fold_kernels_);
9329     TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "FindExportKernels failed.");
9330-    status = texport.ExportNet(export_kernels, tensors_, output_tensor_name, model_.get(), quant_type);
9331+    status =
9332+      texport.ExportNet(export_kernels, tensors_, const_output_tensors_, output_tensor_name, model_.get(), quant_type);
9333   } else {
9334     if (!output_tensor_name.empty() && model_type == MT_TRAIN) {
9335       MS_LOG(WARNING) << "Train model does not support to export selected output tensor, and all of the train kernels "
9336@@ -1234,9 +1276,15 @@ int TrainSession::ExportByDifferentType(DestType destination, ModelType model_ty
9337       }
9338       return status;
9339     } else {
9340-      status = texport.ExportNet((model_type == MT_TRAIN) ? train_kernels_ : inference_kernels_, tensors_,
9341-                                 (model_type == MT_TRAIN) ? train_output_tensor_names_ : eval_output_tensor_names_,
9342-                                 model_.get(), quant_type);
9343+      if (quant_type == QT_NONE) {
9344+        status = texport.ExportNet(
9345+          (model_type == MT_TRAIN) ? train_kernels_ : const_fold_kernels_, tensors_, const_output_tensors_,
9346+          (model_type == MT_TRAIN) ? train_output_tensor_names_ : eval_output_tensor_names_, model_.get(), quant_type);
9347+      } else {
9348+        status = texport.ExportNet((model_type == MT_TRAIN) ? train_kernels_ : inference_kernels_, tensors_, {},
9349+                                   (model_type == MT_TRAIN) ? train_output_tensor_names_ : eval_output_tensor_names_,
9350+                                   model_.get(), quant_type);
9351+      }
9352     }
9353   }
9354   TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to export Network.");
9355@@ -1322,14 +1370,13 @@ int TrainSession::ExportWeightsCollaborateWithMicro(const std::string &file_name
9356   MS_CHECK_FALSE_MSG(format != FT_FLATBUFFERS, RET_ERROR, "File name cannot be empty");
9357   MS_CHECK_FALSE_MSG(model_type != mindspore::lite::MT_INFERENCE, RET_ERROR,
9358                      "Currently, can only export inference-model's weights.");
9359-  int status = Eval();
9360-  TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Eval failed");
9361
9362   TrainExport texport(file_name);
9363-  status = texport.ExportInit(model_.get()->graph_.name_, model_.get()->graph_.version_);
9364+  auto status = texport.ExportInit(model_.get()->graph_.name_, model_.get()->graph_.version_);
9365   TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to init export");
9366
9367-  status = texport.ExportNet(inference_kernels_, tensors_, eval_output_tensor_names_, model_.get(), QT_DEFAULT);
9368+  status = texport.ExportNet(const_fold_kernels_, tensors_, const_output_tensors_, eval_output_tensor_names_,
9369+                             model_.get(), QT_NONE);
9370   TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to export Network.");
9371   status = texport.TrainModelDrop();
9372   TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "TrainModelDrop failed.");
9373diff --git a/mindspore/lite/src/train/train_session.h b/mindspore/lite/src/train/train_session.h
9374index 24f10065..0bd14b21 100644
9375--- a/mindspore/lite/src/train/train_session.h
9376+++ b/mindspore/lite/src/train/train_session.h
9377@@ -128,6 +128,7 @@ class TrainSession : virtual public lite::LiteSession {
9378   virtual int CompileInferenceKernels();
9379   virtual void CompileOptimizedKernels();
9380   virtual void CompileTrainableParams();
9381+  virtual int CompileConstFoldedKernels();
9382   virtual void CompileTrainOutputs();
9383   virtual void CompileEvalOutputs();
9384   virtual int InitCallBack();
9385@@ -146,6 +147,8 @@ class TrainSession : virtual public lite::LiteSession {
9386
9387   std::vector<kernel::KernelExec *> inference_kernels_;
9388   std::vector<kernel::KernelExec *> train_kernels_;
9389+  std::vector<kernel::KernelExec *> const_fold_kernels_;
9390+  std::vector<lite::Tensor *> const_output_tensors_;
9391   TrainCfg cfg_;
9392
9393  private:
9394diff --git a/mindspore/lite/src/train/train_utils.cc b/mindspore/lite/src/train/train_utils.cc
9395index 32c4a502..cb7b669a 100644
9396--- a/mindspore/lite/src/train/train_utils.cc
9397+++ b/mindspore/lite/src/train/train_utils.cc
9398@@ -204,5 +204,20 @@ int ScaleTensor(Tensor *tensor, float scale) {
9399   MS_LOG(DEBUG) << "Scale tensor: " << tensor->tensor_name() << " " << scale;
9400   return tensor->Scale<float>(scale);
9401 }
9402+
9403+std::vector<Tensor *> TSFindTensors(const kernel::KernelExec *pre_kernel, const kernel::KernelExec *post_kernel) {
9404+  MS_ASSERT(pre_kernel != nullptr);
9405+  MS_ASSERT(post_kernel != nullptr);
9406+  auto out_tensors = pre_kernel->out_tensors();
9407+  auto in_tensors = post_kernel->in_tensors();
9408+  std::vector<Tensor *> res;
9409+  for (auto tensor : out_tensors) {
9410+    if (std::find(in_tensors.begin(), in_tensors.end(), tensor) == in_tensors.end()) {
9411+      continue;
9412+    }
9413+    res.push_back(tensor);
9414+  }
9415+  return res;
9416+}
9417 }  // namespace lite
9418 }  // namespace mindspore
9419diff --git a/mindspore/lite/src/train/train_utils.h b/mindspore/lite/src/train/train_utils.h
9420index 5c85738f..9b2d62dc 100644
9421--- a/mindspore/lite/src/train/train_utils.h
9422+++ b/mindspore/lite/src/train/train_utils.h
9423@@ -36,6 +36,7 @@ float CalculateSparseClassification(lite::Tensor *input, lite::Tensor *output);
9424 float CalculateOneHotClassification(lite::Tensor *input, lite::Tensor *output);
9425 Tensor *CastTensor(Tensor *tensor, TypeId dst_data_type, bool support_fp16);
9426 int ScaleTensor(Tensor *tensor, float scale);
9427+std::vector<Tensor *> TSFindTensors(const kernel::KernelExec *pre_kernel, const kernel::KernelExec *post_kernel);
9428 }  // namespace lite
9429 }  // namespace mindspore
9430 #endif  // MINDSPORE_LITE_SRC_TRAIN_TRAIN_UTILS_H_
9431diff --git a/mindspore/lite/src/train/transfer_session.cc b/mindspore/lite/src/train/transfer_session.cc
9432index 48191b4f..b1cb7b3e 100644
9433--- a/mindspore/lite/src/train/transfer_session.cc
9434+++ b/mindspore/lite/src/train/transfer_session.cc
9435@@ -230,10 +230,10 @@ int TransferSession::ExportInner(DestType destination, ModelType model_type, Qua
9436       MS_LOG(ERROR) << "FindExportKernels failed.";
9437       return RET_ERROR;
9438     }
9439-    status = texport.ExportNet(export_kernels, tensors_, out_put_tensor_name, model_.get(), quant_type,
9440+    status = texport.ExportNet(export_kernels, tensors_, {}, out_put_tensor_name, model_.get(), quant_type,
9441                                backbone_session_->model_);
9442   } else {
9443-    status = texport.ExportNet(inference_kernels_, tensors_, GetOutputTensorNames(), model_.get(), quant_type,
9444+    status = texport.ExportNet(inference_kernels_, tensors_, {}, GetOutputTensorNames(), model_.get(), quant_type,
9445                                backbone_session_->model_);
9446   }
9447   if (status != RET_OK) {
9448diff --git a/mindspore/lite/tools/common/string_util.cc b/mindspore/lite/tools/common/string_util.cc
9449index 8d7076e5..13cddb3a 100644
9450--- a/mindspore/lite/tools/common/string_util.cc
9451+++ b/mindspore/lite/tools/common/string_util.cc
9452@@ -199,5 +199,9 @@ size_t Hex2ByteArray(const std::string &hex_str, unsigned char *byte_array, size
9453   }
9454   return byte_len;
9455 }
9456+
// Returns true when `item` is a non-empty string consisting solely of decimal
// digits ('0'..'9'). An empty string is not a number — std::all_of alone would
// vacuously return true for it.
bool IsNumber(const std::string &item) {
  return !item.empty() && std::all_of(item.begin(), item.end(), [](char ch) { return ch >= '0' && ch <= '9'; });
}
9460 }  // namespace lite
9461 }  // namespace mindspore
9462diff --git a/mindspore/lite/tools/common/string_util.h b/mindspore/lite/tools/common/string_util.h
9463index 0fb9c0b2..95bdd742 100644
9464--- a/mindspore/lite/tools/common/string_util.h
9465+++ b/mindspore/lite/tools/common/string_util.h
9466@@ -45,6 +45,8 @@ bool ConvertBool(std::string str, bool *value);
9467 bool ConvertDoubleVector(const std::string &str, std::vector<double> *value);
9468
9469 size_t Hex2ByteArray(const std::string &hex_str, unsigned char *byte_array, size_t max_len);
9470+
9471+bool IsNumber(const std::string &item);
9472 }  // namespace lite
9473 }  // namespace mindspore
9474 #endif  // MINDSPORE_LITE_TOOLS_COMMON_STRING_UTIL_H_
9475diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc
9476index c4f84163..b63912fa 100644
9477--- a/mindspore/lite/tools/converter/anf_transform.cc
9478+++ b/mindspore/lite/tools/converter/anf_transform.cc
9479@@ -135,6 +135,7 @@
9480 #include "tools/common/string_util.h"
9481 #include "src/common/common.h"
9482 #include "tools/optimizer/graph/miniaturization_pass.h"
9483+#include "tools/optimizer/fusion/tile_matmul_fusion.h"
9484
9485 using std::string;
9486 namespace mindspore::lite {
9487@@ -317,7 +318,8 @@ std::vector<opt::PassPtr> InitFusions(const std::shared_ptr<ConverterPara> &para
9488                                     std::make_shared<opt::MulActivationFusion>(),
9489                                     std::make_shared<opt::AddActivationFusion>(),
9490                                     std::make_shared<opt::ExpandDimsReshapeFusion>(),
9491-                                    std::make_shared<opt::SqueezeExpandDimsFusion>()};
9492+                                    std::make_shared<opt::SqueezeExpandDimsFusion>(),
9493+                                    std::make_shared<opt::TileMatMulFusion>()};
9494   if (param->optimize_transformer) {
9495     fusions.push_back(std::make_shared<opt::MultiHeadAttentionFusion>());
9496     fusions.push_back(std::make_shared<opt::EncoderLayerFusion>());
9497diff --git a/mindspore/lite/tools/converter/config_parser/config_file_parser.cc b/mindspore/lite/tools/converter/config_parser/config_file_parser.cc
9498index 2e7ca749..7b47fb8c 100644
9499--- a/mindspore/lite/tools/converter/config_parser/config_file_parser.cc
9500+++ b/mindspore/lite/tools/converter/config_parser/config_file_parser.cc
9501@@ -19,10 +19,10 @@
9502 #include "include/errorcode.h"
9503 #include "src/common/log_adapter.h"
9504 #include "tools/converter/converter_context.h"
9505-
9506 #include "tools/common/string_util.h"
9507 #include "src/common/config_infos.h"
9508 #include "src/common/common.h"
9509+#include "nnacl/op_base.h"
9510
9511 namespace mindspore {
9512 namespace lite {
9513@@ -208,6 +208,75 @@ void SetDynParams(const std::shared_ptr<mindspore::ConverterPara> &param,
9514   }
9515 }
9516
9517+int ParseInputShapeTemplate(const std::string &shape_template, std::set<std::string> *dynamic_symbols) {
9518+  // the inputs_shape config is like: input1:[d0,d1,3];input2:[4,d0]
9519+  auto graph_inputs_shape_vec = SplitStringToVector(shape_template, ';');
9520+  for (const auto &graph_input_shape : graph_inputs_shape_vec) {
9521+    auto graph_input_shape_info = SplitStringToVector(graph_input_shape, ':');
9522+    MS_CHECK_TRUE_MSG(graph_input_shape_info.size() == kIndex2, RET_INPUT_PARAM_INVALID, "the inputs_shape is invalid");
9523+    auto input_shape = graph_input_shape_info[1];
9524+    if (input_shape[0] != '[' || input_shape[input_shape.size() - 1] != ']') {
9525+      MS_LOG(ERROR) << "the inputs_shape is invalid";
9526+      return RET_INPUT_PARAM_INVALID;
9527+    }
9528+    input_shape = input_shape.substr(1, input_shape.size() - kIndex2);
9529+    auto input_shape_vec = SplitStringToVector(input_shape, ',');
9530+    for (const auto &shape : input_shape_vec) {
9531+      if (!IsNumber(shape)) {
9532+        dynamic_symbols->insert(shape);
9533+      }
9534+    }
9535+  }
9536+  return RET_OK;
9537+}
9538+
9539+int ParseDynmiacDimTemplate(const std::string &dims_template, std::set<std::string> *dynamic_symbols,
9540+                            MicroParamString *micro_param_string) {
9541+  // the dynamic_dim_params config is like: d0:[1,3~6];d1:[1~8]
9542+  auto dim_info_vec = SplitStringToVector(dims_template, ';');
9543+  MS_CHECK_TRUE_MSG(dim_info_vec.size() <= kIndex2, RET_NOT_SUPPORT, "currently, at most two dynamic dims can be set");
9544+  for (const auto &dim_info : dim_info_vec) {
9545+    auto dim_vec = SplitStringToVector(dim_info, ':');
9546+    MS_CHECK_TRUE_MSG(dim_vec.size() == kIndex2, RET_INPUT_PARAM_INVALID, "the dynamic_dim_params is invalid");
9547+    std::string symbol = dim_vec[0];
9548+    if (dynamic_symbols->find(symbol) == dynamic_symbols->end()) {
9549+      MS_LOG(ERROR) << symbol << " is invalid, because it's not set in the inputs_shape.";
9550+      return RET_INPUT_PARAM_INVALID;
9551+    }
9552+    std::string dim_range = dim_vec[1];
9553+    if (dim_range[0] != '[' || dim_range[dim_range.size() - 1] != ']') {
9554+      MS_LOG(ERROR) << "the dynamic_dim_params is invalid";
9555+      return RET_INPUT_PARAM_INVALID;
9556+    }
9557+    dim_range = dim_range.substr(1, dim_range.size() - kIndex2);
9558+    auto discrete_vec = SplitStringToVector(dim_range, ',');
9559+    for (const auto &dim : discrete_vec) {
9560+      auto continuous_dim = SplitStringToVector(dim, '~');
9561+      MS_CHECK_TRUE_MSG(continuous_dim.size() == C1NUM || continuous_dim.size() == kIndex2, RET_INPUT_PARAM_INVALID,
9562+                        "the dynamic_dim_params is invalid");
9563+      if (continuous_dim.size() == C1NUM) {
9564+        if (!IsNumber(continuous_dim[0]) || std::stoi(continuous_dim[0]) <= 0) {
9565+          MS_LOG(ERROR) << "the dynamic_dim_params range value must be greater than 0";
9566+          return RET_INPUT_PARAM_INVALID;
9567+        }
9568+        micro_param_string->dynamic_symbols_map[symbol] += continuous_dim[0] + ",";
9569+        continue;
9570+      }
9571+      if (!IsNumber(continuous_dim[0]) || std::stoi(continuous_dim[0]) <= 0 || !IsNumber(continuous_dim[1]) ||
9572+          std::stoi(continuous_dim[1]) <= 0) {
9573+        MS_LOG(ERROR) << "the dynamic_dim_params range value must be greater than 0";
9574+        return RET_INPUT_PARAM_INVALID;
9575+      }
9576+      auto start = std::stoi(continuous_dim[0]);
9577+      auto end = std::stoi(continuous_dim[1]);
9578+      for (auto i = start; i <= end; ++i) {
9579+        micro_param_string->dynamic_symbols_map[symbol] += std::to_string(i) + ",";
9580+      }
9581+    }
9582+  }
9583+  return RET_OK;
9584+}
9585+
9586 void ConfigFileParser::SetParamByConfigfile(const std::shared_ptr<mindspore::ConverterPara> &param,
9587                                             const std::map<std::string, std::string> &ascend_map) {
9588   std::string ascend_string = "";
9589@@ -377,8 +446,12 @@ int ConfigFileParser::ParseConfigParam(std::map<std::string, std::map<std::strin
9590 }
9591
9592 int ConfigFileParser::SetMapData(const std::map<std::string, std::string> &input_map,
9593-                                 const std::map<std::string, std::string &> &parse_map, const std::string &section) {
9594+                                 const std::map<std::string, std::string &> &parse_map, const std::string &section,
9595+                                 const std::set<std::string> &dynamic_key) {
9596   for (const auto &map : input_map) {
9597+    if (dynamic_key.find(map.first) != dynamic_key.end()) {
9598+      continue;
9599+    }
9600     if (parse_map.find(map.first) == parse_map.end()) {
9601       MS_LOG(ERROR) << "INPUT ILLEGAL: `" << map.first << "` is not supported in "
9602                     << "[" << section << "]";
9603@@ -511,21 +584,34 @@ int ConfigFileParser::ParseAclOptionCfgString(const std::map<std::string, std::m
9604 }
9605
9606 int ConfigFileParser::ParseMicroParamString(const std::map<std::string, std::map<std::string, std::string>> &maps) {
9607-  if (maps.find(kMicroParam) != maps.end()) {
9608-    const auto &map = maps.at(kMicroParam);
9609-    std::map<std::string, std::string &> parse_map{
9610-      {"target", micro_param_string_.target},
9611-      {"codegen_mode", micro_param_string_.codegen_mode},
9612-      {"debug_mode", micro_param_string_.debug_mode},
9613-      {"support_parallel", micro_param_string_.support_parallel},
9614-      {"enable_micro", micro_param_string_.enable_micro},
9615-      {"save_path", micro_param_string_.save_path},
9616-      {"project_name", micro_param_string_.project_name},
9617-      {"keep_original_weight", micro_param_string_.keep_original_weight},
9618-      {"changeable_weights_name", micro_param_string_.changeable_weights_name}};
9619-    return SetMapData(map, parse_map, kMicroParam);
9620+  if (maps.find(kMicroParam) == maps.end()) {
9621+    return RET_OK;
9622   }
9623-  return RET_OK;
9624+  const auto &map = maps.at(kMicroParam);
9625+  const std::string graph_inputs_shape_template = "inputs_shape";
9626+  std::set<std::string> dynamic_symbols;
9627+  if (map.find(graph_inputs_shape_template) != map.end()) {
9628+    const auto &shape_template = map.at(graph_inputs_shape_template);
9629+    ParseInputShapeTemplate(shape_template, &dynamic_symbols);
9630+  }
9631+  const std::string dynamic_dims = "dynamic_dim_params";
9632+  if (!dynamic_symbols.empty() && map.find(dynamic_dims) != map.end()) {
9633+    const auto &dims_template = map.at(dynamic_dims);
9634+    ParseDynmiacDimTemplate(dims_template, &dynamic_symbols, &micro_param_string_);
9635+  }
9636+  std::map<std::string, std::string &> parse_map{
9637+    {"target", micro_param_string_.target},
9638+    {"codegen_mode", micro_param_string_.codegen_mode},
9639+    {"debug_mode", micro_param_string_.debug_mode},
9640+    {"support_parallel", micro_param_string_.support_parallel},
9641+    {"enable_micro", micro_param_string_.enable_micro},
9642+    {"save_path", micro_param_string_.save_path},
9643+    {"project_name", micro_param_string_.project_name},
9644+    {"keep_original_weight", micro_param_string_.keep_original_weight},
9645+    {"changeable_weights_name", micro_param_string_.changeable_weights_name},
9646+    {"inputs_shape", micro_param_string_.inputs_shape},
9647+    {"dynamic_dim_params", micro_param_string_.dynamic_dim_params}};
9648+  return SetMapData(map, parse_map, kMicroParam);
9649 }
9650
9651 int ConfigFileParser::ParseWeightQuantString(const std::map<std::string, std::map<std::string, std::string>> &maps) {
9652diff --git a/mindspore/lite/tools/converter/config_parser/config_file_parser.h b/mindspore/lite/tools/converter/config_parser/config_file_parser.h
9653index 6997bac8..163782b7 100644
9654--- a/mindspore/lite/tools/converter/config_parser/config_file_parser.h
9655+++ b/mindspore/lite/tools/converter/config_parser/config_file_parser.h
9656@@ -108,17 +108,20 @@ struct MicroParamString {
9657   std::string project_name;
9658   std::string keep_original_weight;
9659   std::string changeable_weights_name;
9660+  std::string inputs_shape;
9661+  std::string dynamic_dim_params;
9662+  std::map<std::string, std::string> dynamic_symbols_map;
9663 };
9664
9665 struct ThirdPartyModelString {
9666   std::string input_dtypes;
9667   std::string input_shapes;
9668-  std::string input_names;  // optional, default: ""
9669+  std::string input_names;    // optional, default: ""
9670   std::string input_formats;  // optional, default: NHWC
9671   std::string output_dtypes;
9672   std::string output_shapes;
9673-  std::string output_names;  // optional, default: ""
9674-  std::string output_formats;  // optional, default: NHWC
9675+  std::string output_names;         // optional, default: ""
9676+  std::string output_formats;       // optional, default: NHWC
9677   std::string extended_parameters;  // format: {key1:value1;ker2:value2}
9678 };
9679
9680@@ -172,7 +175,8 @@ class ConfigFileParser {
9681   int ParseRegistryInfoString(const std::map<std::string, std::map<std::string, std::string>> &maps);
9682   int ParseAclOptionCfgString(const std::map<std::string, std::map<std::string, std::string>> &maps);
9683   int SetMapData(const std::map<std::string, std::string> &input_map,
9684-                 const std::map<std::string, std::string &> &parse_map, const std::string &section);
9685+                 const std::map<std::string, std::string &> &parse_map, const std::string &section,
9686+                 const std::set<std::string> &dynamic_key = {});
9687   int ParseMicroParamString(const std::map<std::string, std::map<std::string, std::string>> &maps);
9688   int ParseThirdPartyParamString(const std::map<std::string, std::map<std::string, std::string>> &sections);
9689   int ParseCpuOptionCfgString(const std::map<std::string, std::map<std::string, std::string>> &maps);
9690diff --git a/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc b/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc
9691index c9998cc8..903f2863 100644
9692--- a/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc
9693+++ b/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc
9694@@ -19,6 +19,7 @@
9695 #include "tools/common/string_util.h"
9696 #include "src/common/log_adapter.h"
9697 #include "src/common/log_util.h"
9698+#include "nnacl/op_base.h"
9699
9700 namespace mindspore {
9701 namespace lite {
9702@@ -115,6 +116,80 @@ STATUS MicroParamParser::ParseChangeableWeightsName(const std::string &changeabl
9703   return RET_OK;
9704 }
9705
9706+STATUS MicroParamParser::ParseGraphInputsShapeTemplate(const std::string &graph_inputs_shape_template,
9707+                                                       const std::map<std::string, std::string> &dynamic_symbols_map,
9708+                                                       micro::MicroParam *micro_param) {
9709+  MS_LOG(DEBUG) << "Micro record inputs shape: " << graph_inputs_shape_template;
9710+  if (!graph_inputs_shape_template.empty()) {
9711+    auto graph_inputs_shape_vec = SplitStringToVector(graph_inputs_shape_template, ';');
9712+    std::map<std::string, std::vector<std::string>> graph_inputs_info;
9713+    std::vector<std::vector<std::string>> graph_inputs_shape;
9714+    std::vector<std::string> inputs_name;
9715+    for (const auto &graph_input_shape : graph_inputs_shape_vec) {
9716+      auto input_shape_info = SplitStringToVector(graph_input_shape, ':');
9717+      std::string input_name = input_shape_info[0];
9718+      std::string input_shape = input_shape_info[1].substr(1, input_shape_info[1].size() - C2NUM);
9719+      auto input_shape_vec = SplitStringToVector(input_shape, ',');
9720+      graph_inputs_info[input_name] = input_shape_vec;
9721+      graph_inputs_shape.push_back(input_shape_vec);
9722+      inputs_name.push_back(input_name);
9723+    }
9724+    micro_param->graph_inputs_origin_info = graph_inputs_info;
9725+    micro_param->inputs_shape_by_scenes.clear();
9726+    std::map<std::string, std::vector<int>> symbols_to_num;
9727+    std::map<std::string, int> symbols_index;
9728+    std::vector<std::string> symbols;
9729+    std::vector<size_t> scene_num_by_symbol;
9730+    int index = 0;
9731+    size_t scene_num = 1;
9732+    for (const auto &item : dynamic_symbols_map) {
9733+      symbols_index[item.first] = index++;
9734+      symbols.push_back(item.first);
9735+      auto num_str_list = SplitStringToVector(item.second, ',');
9736+      for (const auto &num_str : num_str_list) {
9737+        symbols_to_num[item.first].push_back(std::stoi(num_str));
9738+      }
9739+      if (symbols_to_num[item.first].empty()) {
9740+        MS_LOG(ERROR) << "Micro param invalid, dynamic symbol must have value.";
9741+        return RET_INPUT_PARAM_INVALID;
9742+      }
9743+      scene_num_by_symbol.push_back(symbols_to_num[item.first].size());
9744+      scene_num *= symbols_to_num[item.first].size();
9745+    }
9746+    micro_param->dynamic_symbols = symbols;
9747+    micro_param->dynamic_symbols_num = scene_num_by_symbol;
9748+    std::vector<size_t> post_multi(symbols.size(), 1);
9749+    for (int i = static_cast<int>(post_multi.size()) - 2; i >= 0; --i) {
9750+      post_multi[i] = post_multi[i + 1] * scene_num_by_symbol[i + 1];
9751+    }
9752+    std::vector<int> real_num(symbols.size());
9753+    for (size_t i = 0; i < scene_num; ++i) {
9754+      size_t remain = i;
9755+      for (size_t j = 0; j < symbols.size(); ++j) {
9756+        real_num[j] = remain / post_multi[j];
9757+        remain %= post_multi[j];
9758+      }
9759+      for (size_t j = 0; j < graph_inputs_shape.size(); ++j) {
9760+        const auto &input_template = graph_inputs_shape[j];
9761+        std::vector<int> input_shape;
9762+        for (const auto &dim : input_template) {
9763+          if (IsNumber(dim)) {
9764+            input_shape.push_back(std::stoi(dim));
9765+            continue;
9766+          }
9767+          if (symbols_index.find(dim) == symbols_index.end()) {
9768+            MS_LOG(ERROR) << "Dynamic symbol cannot find real num.";
9769+            return RET_INPUT_PARAM_INVALID;
9770+          }
9771+          input_shape.push_back(symbols_to_num[dim][real_num[symbols_index[dim]]]);
9772+        }
9773+        micro_param->inputs_shape_by_scenes[inputs_name[j]].push_back(input_shape);
9774+      }
9775+    }
9776+  }
9777+  return RET_OK;
9778+}
9779+
9780 STATUS MicroParamParser::ParseMicroParam(const MicroParamString &micro_param_string, micro::MicroParam *micro_param) {
9781   CHECK_NULL_RETURN(micro_param);
9782   if (ParseTarget(micro_param_string.target, micro_param) != RET_OK) {
9783@@ -145,9 +220,11 @@ STATUS MicroParamParser::ParseMicroParam(const MicroParamString &micro_param_str
9784     MS_LOG(ERROR) << "Parse project name val failed: " << micro_param_string.project_name;
9785     return RET_INPUT_PARAM_INVALID;
9786   }
9787-  if (ParseKeepOriginalWeight(micro_param_string.keep_original_weight, micro_param) != RET_OK) {
9788-    MS_LOG(ERROR) << "Parse keep_original_weight failed, the val: " << micro_param_string.keep_original_weight;
9789-    return RET_INPUT_PARAM_INVALID;
9790+  if (!micro_param_string.keep_original_weight.empty()) {
9791+    if (ParseKeepOriginalWeight(micro_param_string.keep_original_weight, micro_param) != RET_OK) {
9792+      MS_LOG(ERROR) << "Parse keep_original_weight failed, the val: " << micro_param_string.keep_original_weight;
9793+      return RET_INPUT_PARAM_INVALID;
9794+    }
9795   }
9796   if (!micro_param_string.changeable_weights_name.empty() && !micro_param->keep_original_weight) {
9797     MS_LOG(ERROR) << "When changeable_weights_name is set, the keep_original_weight must be true.";
9798@@ -157,6 +234,12 @@ STATUS MicroParamParser::ParseMicroParam(const MicroParamString &micro_param_str
9799     MS_LOG(ERROR) << "Parse changeable_weights_name failed, the val: " << micro_param_string.changeable_weights_name;
9800     return RET_INPUT_PARAM_INVALID;
9801   }
9802+  if (ParseGraphInputsShapeTemplate(micro_param_string.inputs_shape, micro_param_string.dynamic_symbols_map,
9803+                                    micro_param) != RET_OK) {
9804+    MS_LOG(ERROR) << "Parse inputs_shape & dynamic_dim_params failed, the inputs_shape val: "
9805+                  << micro_param_string.inputs_shape;
9806+    return RET_INPUT_PARAM_INVALID;
9807+  }
9808   return RET_OK;
9809 }
9810 }  // namespace lite
9811diff --git a/mindspore/lite/tools/converter/config_parser/micro_param_parser.h b/mindspore/lite/tools/converter/config_parser/micro_param_parser.h
9812index b6efb4c7..eb95c571 100644
9813--- a/mindspore/lite/tools/converter/config_parser/micro_param_parser.h
9814+++ b/mindspore/lite/tools/converter/config_parser/micro_param_parser.h
9815@@ -37,6 +37,9 @@ class MicroParamParser {
9816   STATUS ParseProjName(const std::string &debug_mode, micro::MicroParam *micro_param);
9817   STATUS ParseKeepOriginalWeight(const std::string &keep_weight, micro::MicroParam *micro_param);
9818   STATUS ParseChangeableWeightsName(const std::string &changeable_weights_name, micro::MicroParam *micro_param);
9819+  STATUS ParseGraphInputsShapeTemplate(const std::string &graph_inputs_shape_template,
9820+                                       const std::map<std::string, std::string> &dynamic_symbols_map,
9821+                                       micro::MicroParam *micro_param);
9822 };
9823 }  // namespace lite
9824 }  // namespace mindspore
9825diff --git a/mindspore/lite/tools/converter/converter.cc b/mindspore/lite/tools/converter/converter.cc
9826index a61bd51c..4703e889 100644
9827--- a/mindspore/lite/tools/converter/converter.cc
9828+++ b/mindspore/lite/tools/converter/converter.cc
9829@@ -56,6 +56,7 @@
9830 #include "src/common/file_utils.h"
9831 #include "ops/dynamic_shape.h"
9832 #include "tools/common/parse_config_utils.h"
9833+#include "src/common/file_utils.h"
9834 #include "tools/converter/converter_packed_node.h"
9835 #include "tools/converter/config_parser/cpu_option_param_parser.h"
9836 #include "tools/converter/export_model.h"
9837@@ -432,54 +433,34 @@ int ConverterImpl::InitConfigParam(const std::shared_ptr<ConverterPara> &param,
9838     MS_LOG(ERROR) << "Parse config param failed.";
9839     return ret;
9840   }
9841-  ret = ParseParam(&config_parser, param, model_param_infos, maps);
9842-  if (ret != RET_OK) {
9843-    MS_LOG(ERROR) << "Parse param failed.";
9844-    return ret;
9845-  }
9846-  return RET_OK;
9847-}
9848-
9849-int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std::shared_ptr<ConverterPara> &param,
9850-                              const std::map<int, std::map<std::string, std::string>> *model_param_infos,
9851-                              const std::map<std::string, std::map<std::string, std::string>> maps) {
9852-  param->config_infos = maps;
9853-  auto ret = RET_OK;
9854   if (model_param_infos->empty()) {
9855-    ret =
9856-      lite::PreprocessParser::ParsePreprocess(config_parser->GetDataPreProcessString(), &param->dataPreProcessParam);
9857+    ret = lite::PreprocessParser::ParsePreprocess(config_parser.GetDataPreProcessString(), &param->dataPreProcessParam);
9858     if (ret != RET_OK) {
9859       MS_LOG(ERROR) << "Parse preprocess failed.";
9860       return ret;
9861     }
9862-    ret = lite::QuantParamParser::ParseCommonQuant(config_parser->GetCommonQuantString(), &param->commonQuantParam);
9863+    ret = lite::QuantParamParser::ParseCommonQuant(config_parser.GetCommonQuantString(), &param->commonQuantParam);
9864     if (ret != RET_OK) {
9865       MS_LOG(ERROR) << "Parse common quant param failed.";
9866       return ret;
9867     }
9868-    ret = lite::QuantParamParser::ParseFullQuant(config_parser->GetFullQuantString(), &param->fullQuantParam);
9869+    ret = lite::QuantParamParser::ParseFullQuant(config_parser.GetFullQuantString(), &param->fullQuantParam);
9870     if (ret != RET_OK) {
9871       MS_LOG(ERROR) << "Parse full quant param failed.";
9872       return ret;
9873     }
9874-    ret = lite::QuantParamParser::ParseWeightQuant(config_parser->GetWeightQuantString(), &param->weightQuantParam);
9875-    if (ret != RET_OK) {
9876-      MS_LOG(ERROR) << "Parse full quant param failed.";
9877-      return ret;
9878-    }
9879-    ret = lite::QuantParamParser::ParseMixedBitWeightQuant(config_parser->GetMixedBitWeightQuantString(),
9880+    ret = lite::QuantParamParser::ParseMixedBitWeightQuant(config_parser.GetMixedBitWeightQuantString(),
9881                                                            &param->mixedBitWeightQuantParam);
9882     if (ret != RET_OK) {
9883       MS_LOG(ERROR) << "Parse mixed bit weight quant param failed.";
9884       return ret;
9885     }
9886-    ret = lite::ThirdPartyParamParser::Parse(config_parser->GetThirdPartyModelString(),
9887-                                             &param->thirdPartyModelParam);
9888+    ret = lite::ThirdPartyParamParser::Parse(config_parser.GetThirdPartyModelString(), &param->thirdPartyModelParam);
9889     if (ret != RET_OK) {
9890       MS_LOG(ERROR) << "Parse third party param failed.";
9891       return ret;
9892     }
9893-    ret = InitExtendedIntegrationInfo(param, *config_parser);
9894+    ret = InitExtendedIntegrationInfo(param, config_parser);
9895     if (ret != RET_OK) {
9896       MS_LOG(ERROR) << "Parse extended integration info failed.";
9897       return ret;
9898@@ -490,7 +471,7 @@ int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std::
9899     param->aclModelOptionCfgParam.dump_model_name =
9900       dir_pos != std::string::npos ? output_file.substr(dir_pos + 1) : output_file;
9901     lite::AclOptionParamParser acl_param_parser;
9902-    ret = acl_param_parser.ParseAclOptionCfg(config_parser->GetAclOptionCfgString(), &param->aclModelOptionCfgParam);
9903+    ret = acl_param_parser.ParseAclOptionCfg(config_parser.GetAclOptionCfgString(), &param->aclModelOptionCfgParam);
9904     if (ret != RET_OK) {
9905       MS_LOG(ERROR) << "Parse acl option param failed.";
9906       return ret;
9907@@ -498,14 +479,14 @@ int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std::
9908     // parse ascend_context in config file, the priority is higher
9909     if (maps.find("ascend_context") != maps.end()) {
9910       auto map = maps.at("ascend_context");
9911-      config_parser->SetParamByConfigfile(param, map);
9912+      config_parser.SetParamByConfigfile(param, map);
9913     }
9914     if (!param->config_file.empty()) {
9915       (void)CheckOfflineParallelConfig(param->config_file, &param->parallel_split_config);
9916     }
9917
9918     lite::CpuOptionParamParser cpu_param_parser;
9919-    ret = cpu_param_parser.ParseCpuOptionCfg(config_parser->GetCpuOptionCfgString(), &param->cpuOptionCfgParam);
9920+    ret = cpu_param_parser.ParseCpuOptionCfg(config_parser.GetCpuOptionCfgString(), &param->cpuOptionCfgParam);
9921     if (ret != RET_OK) {
9922       MS_LOG(ERROR) << "Parse cpu option param failed.";
9923       return ret;
9924@@ -515,29 +496,29 @@ int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std::
9925     << "If there are multi models, only support micro_param and model_param, other configure can not take effect";
9926
9927   lite::MicroParamParser micro_param_parser;
9928-  ret = micro_param_parser.ParseMicroParam(config_parser->GetMicroParamString(), &param->microParam);
9929+  ret = micro_param_parser.ParseMicroParam(config_parser.GetMicroParamString(), &param->microParam);
9930   if (ret != RET_OK) {
9931     MS_LOG(ERROR) << "Parse micro param failed.";
9932     return ret;
9933   }
9934   ret =
9935-    lite::QuantParamParser::ParseTransformQuant(config_parser->GetTransformQuantString(), &param->transformQuantParam);
9936+    lite::QuantParamParser::ParseTransformQuant(config_parser.GetTransformQuantString(), &param->transformQuantParam);
9937   if (ret != RET_OK) {
9938     MS_LOG(ERROR) << "Parse transform quant param failed.";
9939     return ret;
9940   }
9941-  ret = lite::QuantParamParser::ParseAscendQuant(config_parser->GetAscendQuantString(), &param->ascendQuantParam);
9942+  ret = lite::QuantParamParser::ParseAscendQuant(config_parser.GetAscendQuantString(), &param->ascendQuantParam);
9943   if (ret != RET_OK) {
9944     MS_LOG(ERROR) << "Parse ascend quant param failed.";
9945     return ret;
9946   }
9947-  ret = lite::QuantParamParser::ParseDynamicQuant(config_parser->GetDynamicQuantString(), &param->dynamicQuantParam);
9948+  ret = lite::QuantParamParser::ParseDynamicQuant(config_parser.GetDynamicQuantString(), &param->dynamicQuantParam);
9949   if (ret != RET_OK) {
9950     MS_LOG(ERROR) << "Parse dynamic quant param failed.";
9951     return ret;
9952   }
9953   lite::GraphKernelParamParser graph_kernel_parser;
9954-  ret = graph_kernel_parser.ParseGraphKernelCfg(config_parser->GetGraphKernelString(), &param->graphKernelParam);
9955+  ret = graph_kernel_parser.ParseGraphKernelCfg(config_parser.GetGraphKernelString(), &param->graphKernelParam);
9956   if (ret != RET_OK) {
9957     MS_LOG(ERROR) << "Parse graph kernel param failed.";
9958     return ret;
9959@@ -708,9 +689,9 @@ int CheckFmkType(const std::shared_ptr<ConverterPara> &param) {
9960   if (param != nullptr) {
9961     return RET_OK;
9962   }
9963-  std::set kValidFmkTypes = {FmkType::kFmkTypeTf,    FmkType::kFmkTypeCaffe,  FmkType::kFmkTypeOnnx,
9964-                           FmkType::kFmkTypeMs,    FmkType::kFmkTypeTflite, FmkType::kFmkTypePytorch,
9965-                           FmkType::kFmkTypeMsLite, FmkType::kFmkTypeThirdParty};
9966+  std::set kValidFmkTypes = {FmkType::kFmkTypeTf,     FmkType::kFmkTypeCaffe,     FmkType::kFmkTypeOnnx,
9967+                             FmkType::kFmkTypeMs,     FmkType::kFmkTypeTflite,    FmkType::kFmkTypePytorch,
9968+                             FmkType::kFmkTypeMsLite, FmkType::kFmkTypeThirdParty};
9969   if (kValidFmkTypes.find(param->fmk_type) == kValidFmkTypes.end()) {
9970     MS_LOG(ERROR) << "INPUT ILLEGAL: fmk_type must be "
9971                      "TF|CAFFE|ONNX|MS|TFLITE|PYTORCH|MSLITE|THIRDPARTY"
9972@@ -1010,7 +991,6 @@ int ConverterImpl::Convert(const std::shared_ptr<ConverterPara> &param, void **m
9973       model_index++;
9974     }
9975   }
9976-
9977   return RET_OK;
9978 }
9979
9980@@ -1045,7 +1025,6 @@ int ConverterImpl::HandleGraphCommon(const std::shared_ptr<ConverterPara> &param
9981     MS_LOG(ERROR) << "Save graph failed: " << ret << " " << GetErrorInfo(ret);
9982     return ret;
9983   }
9984-
9985   return RET_OK;
9986 }
9987
9988@@ -1067,8 +1046,8 @@ int ConverterImpl::ExecuteMicro(const schema::MetaGraphT *meta_graph, const std:
9989   }
9990   auto status =
9991     meta_graph != nullptr
9992-      ? micro::Coder::MicroSourceCodeGeneration(*meta_graph, output_path, param->microParam, param->weight_fp16)
9993-      : micro::Coder::MicroSourceCodeGeneration(param->model_file, output_path, param->microParam, param->weight_fp16);
9994+      ? micro::Coder::MicroSourceCodeGeneration(*meta_graph, output_path, &param->microParam, param->weight_fp16)
9995+      : micro::Coder::MicroSourceCodeGeneration(param->model_file, output_path, &param->microParam, param->weight_fp16);
9996   if (status != RET_OK) {
9997     MS_LOG(ERROR) << "Execute Micro failed.";
9998   }
9999@@ -1123,7 +1102,6 @@ int ConverterImpl::SaveGraph(FuncGraphPtr graph, const std::shared_ptr<Converter
10000     MS_LOG(ERROR) << "Save failed:" << status << " " << GetErrorInfo(status);
10001     return status;
10002   }
10003-
10004   return RET_OK;
10005 }
10006
10007diff --git a/mindspore/lite/tools/converter/import/mindspore_importer.cc b/mindspore/lite/tools/converter/import/mindspore_importer.cc
10008index 1d5afde4..aee0c854 100644
10009--- a/mindspore/lite/tools/converter/import/mindspore_importer.cc
10010+++ b/mindspore/lite/tools/converter/import/mindspore_importer.cc
10011@@ -39,6 +39,7 @@
10012 #include "tools/optimizer/graph/redundant_op_remove_pass.h"
10013 #include "nnacl/op_base.h"
10014 #include "src/common/common.h"
10015+#include "tools/converter/import/to_custom_op_pass.h"
10016
10017 namespace mindspore::lite {
10018 namespace {
10019@@ -89,6 +90,13 @@ STATUS MindsporeImporter::Mindir2AnfAdjust(const FuncGraphPtr &func_graph,
10020     ReturnCode::GetSingleReturnCode()->UpdateReturnCode(RET_ERROR);
10021     return RET_ERROR;
10022   }
10023+  auto to_custom_op_pass = std::make_shared<mindspore::opt::ToCustomOpPass>();
10024+  MS_CHECK_TRUE_MSG(to_custom_op_pass != nullptr, RET_NULL_PTR, "to_custom_op_pass is nullptr.");
10025+  if (!to_custom_op_pass->Run(func_graph)) {
10026+    MS_LOG(ERROR) << "To custom op pass run failed!";
10027+    ReturnCode::GetSingleReturnCode()->UpdateReturnCode(RET_ERROR);
10028+    return RET_ERROR;
10029+  }
10030   return RET_OK;
10031 }
10032
10033diff --git a/mindspore/lite/tools/converter/import/to_custom_op_pass.cc b/mindspore/lite/tools/converter/import/to_custom_op_pass.cc
10034new file mode 100644
10035index 00000000..55e524e6
10036--- /dev/null
10037+++ b/mindspore/lite/tools/converter/import/to_custom_op_pass.cc
10038@@ -0,0 +1,86 @@
10039+/**
10040+ * Copyright 2023 Huawei Technologies Co., Ltd
10041+ *
10042+ * Licensed under the Apache License, Version 2.0 (the "License");
10043+ * you may not use this file except in compliance with the License.
10044+ * You may obtain a copy of the License at
10045+ *
10046+ * http://www.apache.org/licenses/LICENSE-2.0
10047+ *
10048+ * Unless required by applicable law or agreed to in writing, software
10049+ * distributed under the License is distributed on an "AS IS" BASIS,
10050+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10051+ * See the License for the specific language governing permissions and
10052+ * limitations under the License.
10053+ */
10054+
10055+#include "tools/converter/import/to_custom_op_pass.h"
10056+#include "ops/grad/gather_d_grad_v2.h"
10057+#include "ops/masked_fill.h"
10058+#include "ops/custom.h"
10059+#include "ops/op_utils.h"
10060+#include "mindspore/ccsrc/include/common/utils/utils.h"
10061+#include "nnacl/custom_gather_d_grad_v2_parameter.h"
10062+
10063+using mindspore::ops::kNameGatherDGradV2;
10064+using mindspore::ops::kNameMaskedFill;
10065+
10066+namespace mindspore {
10067+namespace opt {
10068+bool ToCustomOpPass::Run(const FuncGraphPtr &graph) {
10069+  MS_ASSERT(graph != nullptr);
10070+  auto manager = graph->manager();
10071+  MS_ASSERT(manager != nullptr);
10072+  auto node_list = TopoSort(graph->get_return());
10073+
10074+  for (auto &node : node_list) {
10075+    if (!utils::isa<CNodePtr>(node)) {
10076+      continue;
10077+    }
10078+    auto cnode = node->cast<CNodePtr>();
10079+    MS_ASSERT(cnode != nullptr);
10080+    auto value_node = cnode->input(0);
10081+    auto prim = GetValueNode<PrimitivePtr>(value_node);
10082+    if (prim == nullptr) {
10083+      MS_LOG(DEBUG) << "this is a call cnode, which input[0] is fg.";
10084+      continue;
10085+    }
10086+
10087+    auto func = ToCustomOpRegistry::GetInstance()->GetToCustomOpFunc(prim->name());
10088+    if (func == nullptr) {
10089+      continue;
10090+    }
10091+
10092+    auto ret = func(cnode);
10093+    if (ret != RET_OK) {
10094+      MS_LOG(ERROR) << "failed to convert normal cnode node to custom cnode";
10095+      return false;
10096+    }
10097+  }
10098+  return true;
10099+}
10100+
10101+int GatherDGradV2ToCustomOp(const CNodePtr &cnode) {
10102+  auto ori_prim = ops::GetOperator<ops::GatherDGradV2>(cnode->input(kAnfPrimitiveIndex));
10103+  auto dim = ori_prim->get_dim();
10104+  auto dim_str = std::to_string(dim);
10105+  std::map<std::string, std::vector<uint8_t>> attrs;
10106+  attrs["dim"] = std::vector<uint8_t>(dim_str.begin(), dim_str.end());
10107+  auto custom_prim = std::make_shared<mindspore::ops::Custom>();
10108+  custom_prim->set_type(kNameGatherDGradV2);
10109+  cnode->set_input(kAnfPrimitiveIndex, NewValueNode(custom_prim->GetPrim()));
10110+  custom_prim->set_attr(attrs);
10111+  return RET_OK;
10112+}
10113+
10114+int MaskedFillToCustomOp(const CNodePtr &cnode) {
10115+  auto custom_prim = std::make_shared<mindspore::ops::Custom>();
10116+  custom_prim->set_type(kNameMaskedFill);
10117+  cnode->set_input(kAnfPrimitiveIndex, NewValueNode(custom_prim->GetPrim()));
10118+  return RET_OK;
10119+}
10120+
10121+REGISTER_TO_CUSTOM_OP(kNameGatherDGradV2, GatherDGradV2ToCustomOp);
10122+REGISTER_TO_CUSTOM_OP(kNameMaskedFill, MaskedFillToCustomOp);
10123+}  // namespace opt
10124+}  // namespace mindspore
10125diff --git a/mindspore/lite/tools/converter/import/to_custom_op_pass.h b/mindspore/lite/tools/converter/import/to_custom_op_pass.h
10126new file mode 100644
10127index 00000000..7108e48b
10128--- /dev/null
10129+++ b/mindspore/lite/tools/converter/import/to_custom_op_pass.h
10130@@ -0,0 +1,68 @@
10131+/**
10132+ * Copyright 2023 Huawei Technologies Co., Ltd
10133+ *
10134+ * Licensed under the Apache License, Version 2.0 (the "License");
10135+ * you may not use this file except in compliance with the License.
10136+ * You may obtain a copy of the License at
10137+ *
10138+ * http://www.apache.org/licenses/LICENSE-2.0
10139+ *
10140+ * Unless required by applicable law or agreed to in writing, software
10141+ * distributed under the License is distributed on an "AS IS" BASIS,
10142+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10143+ * See the License for the specific language governing permissions and
10144+ * limitations under the License.
10145+ */
10146+
10147+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_IMPORT_TO_CUSTOM_OP_PASS_H_
10148+#define MINDSPORE_LITE_TOOLS_CONVERTER_IMPORT_TO_CUSTOM_OP_PASS_H_
10149+#include <string>
10150+#include "backend/common/optimizer/pass.h"
10151+#include "tools/optimizer/common/gllo_utils.h"
10152+#include <map>
10153+namespace mindspore {
10154+namespace opt {
10155+
10156+typedef int (*ToCustomOpFunc)(const CNodePtr &cnode);
10157+class ToCustomOpRegistry {
10158+ public:
10159+  static ToCustomOpRegistry *GetInstance() {
10160+    static ToCustomOpRegistry registry;
10161+    return &registry;
10162+  }
10163+
10164+  void InsertToCustomOpMap(const std::string &key, ToCustomOpFunc creator) { to_custom_op_funcs_[key] = creator; }
10165+
10166+  ToCustomOpFunc GetToCustomOpFunc(const std::string &key) {
10167+    auto iter = to_custom_op_funcs_.find(key);
10168+    if (iter == to_custom_op_funcs_.end()) {
10169+      MS_LOG(DEBUG) << "Unsupported primitive type : " << key;
10170+      return nullptr;
10171+    }
10172+    return iter->second;
10173+  }
10174+
10175+ protected:
10176+  std::map<std::string, ToCustomOpFunc> to_custom_op_funcs_;
10177+};
10178+
10179+class RegistryToCustomOp {
10180+ public:
10181+  RegistryToCustomOp(const std::string &key, ToCustomOpFunc creator) {
10182+    ToCustomOpRegistry::GetInstance()->InsertToCustomOpMap(key, creator);
10183+  }
10184+  virtual ~RegistryToCustomOp() = default;
10185+};
10186+
10187+#define REGISTER_TO_CUSTOM_OP(type, to_custom_op_func) \
10188+  RegistryToCustomOp g_##type##_to_custom_op(type, to_custom_op_func);
10189+
10190+class ToCustomOpPass : public Pass {
10191+ public:
10192+  ToCustomOpPass() : Pass("ToCustomOpPass") {}
10193+  ~ToCustomOpPass() = default;
10194+  bool Run(const FuncGraphPtr &graph) override;
10195+};
10196+}  // namespace opt
10197+}  // namespace mindspore
10198+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_IMPORT_TO_CUSTOM_OP_PASS_H_
10199diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc
10200index 8ea838cf..a551196d 100644
10201--- a/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc
10202+++ b/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc
10203@@ -287,7 +287,6 @@ bool FusionPass::MatchTree(const schema::MetaGraphT &graph, size_t nodeIdx, cons
10204 bool FusionPass::CheckMatchParams(const schema::MetaGraphT &graph, size_t nodeIdx,
10205                                   const std::shared_ptr<PatternOp> &target, const std::vector<size_t> &sinkIdes,
10206                                   const std::vector<size_t> &pathSinkIdes) {
10207-  MS_ASSERT(target != nullptr);
10208   MS_ASSERT(nodeIdx < graph.nodes.size());
10209   auto &scope = graph.nodes.at(nodeIdx);
10210   MS_CHECK_TRUE_MSG(scope != nullptr, false, "Node in graph is nullptr");
10211diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc
10212index 371e93fb..ff99f1f4 100644
10213--- a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc
10214+++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc
10215@@ -660,7 +660,9 @@ int InferShapePass::InitSearchTensor(const int64_t &subgraph_index, MetaGraphT *
10216   }
10217   auto &subgraph = graph->subGraph.at(subgraph_index);
10218   for (uint32_t i = 0; i < tensors_.size(); i++) {
10219-    if (IsContain(subgraph->inputIndices, i) || !graph->allTensors.at(i)->data.empty()) {
10220+    if (IsContain(subgraph->inputIndices, i) || !graph->allTensors.at(i)->data.empty() ||
10221+        (graph->allTensors.at(i)->nodeType == NodeType_ValueNode && graph->allTensors.at(i)->dims.size() == 1 &&
10222+         graph->allTensors.at(i)->dims[0] == 0)) {
10223       tensors_[i].is_inferred_ = true;
10224     }
10225   }
10226diff --git a/mindspore/lite/tools/converter/micro/cmake/file_list.cmake b/mindspore/lite/tools/converter/micro/cmake/file_list.cmake
10227index c132460e..5dcf0bb7 100644
10228--- a/mindspore/lite/tools/converter/micro/cmake/file_list.cmake
10229+++ b/mindspore/lite/tools/converter/micro/cmake/file_list.cmake
10230@@ -4,6 +4,8 @@ set(CODER_SRC
10231         ${MICRO_DIR}/coder/context.cc
10232         ${MICRO_DIR}/coder/graph.cc
10233         ${MICRO_DIR}/coder/session.cc
10234+        ${MICRO_DIR}/coder/shape_info_container.cc
10235+        ${MICRO_DIR}/coder/dynamic_mem_manager.cc
10236         ${MICRO_DIR}/coder/utils/coder_utils.cc
10237         ${MICRO_DIR}/coder/utils/dir_utils.cc
10238         ${MICRO_DIR}/coder/utils/train_utils.cc
10239@@ -23,6 +25,7 @@ set(CODER_ALLOCATOR_SRC
10240 set(CODER_GENERATOR_SRC
10241         ${MICRO_DIR}/coder/generator/generator.cc
10242         ${MICRO_DIR}/coder/generator/inference/inference_generator.cc
10243+        # NOTE: allocator_component.cc is already listed below; avoid a duplicate entry
10244         ${MICRO_DIR}/coder/generator/component/common_component.cc
10245         ${MICRO_DIR}/coder/generator/component/weight_component.cc
10246         ${MICRO_DIR}/coder/generator/component/allocator_component.cc
10247@@ -66,6 +69,8 @@ set(CODER_OPCODERS_SRC
10248         ${MICRO_DIR}/coder/opcoders/base/stack_base_coder.cc
10249         ${MICRO_DIR}/coder/opcoders/base/unstack_base_coder.cc
10250         ${MICRO_DIR}/coder/opcoders/base/strided_slice_base_coder.cc
10251+        ${MICRO_DIR}/coder/opcoders/base/reshape_dynamic_base_coder.cc
10252+        ${MICRO_DIR}/coder/opcoders/base/strided_slice_dynamic_base_coder.cc
10253         #### cmsis int8 coder
10254         ${MICRO_DIR}/coder/opcoders/cmsis-nn/int8/add_int8_coder.cc
10255         ${MICRO_DIR}/coder/opcoders/cmsis-nn/int8/conv2d_base_coder.cc
10256@@ -81,23 +86,37 @@ set(CODER_OPCODERS_SRC
10257         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/arithmetic_fp16_coder.cc
10258         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/avg_pooling_fp16_coder.cc
10259         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/concat_fp16_coder.cc
10260+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv2d_delegate_fp16_coder.cc
10261+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_3x3_fp16_coder.cc
10262+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_fp16_coder.cc
10263+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_sw_fp16_coder.cc
10264+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_1x1_fp16_coder.cc
10265+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_fp16_coder.cc
10266+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_winograd_fp16_coder.cc
10267         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/custom_gru_fp16_coder.cc
10268         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/deconv2d_fp16_coder.cc
10269+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/lstm_fp16_coder.cc
10270+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc
10271+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_coder.cc
10272+        # NOTE: resize_fp16_coder.cc is already listed below; avoid a duplicate entry
10273         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/transpose_fp16_coder.cc
10274         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/layernorm_fp16_coder.cc
10275         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/reduce_fp16_coder.cc
10276         ${MICRO_DIR}/coder/opcoders/nnacl/fp16/resize_fp16_coder.cc
10277-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc
10278-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_coder.cc
10279-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv2d_delegate_fp16_coder.cc
10280-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_fp16_coder.cc
10281-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_fp16_coder.cc
10282-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_winograd_fp16_coder.cc
10283-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_1x1_fp16_coder.cc
10284-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_3x3_fp16_coder.cc
10285-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_sw_fp16_coder.cc
10286-        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/lstm_fp16_coder.cc
10287+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc
10288+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc
10289+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc
10290+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc
10291+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc
10292+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc
10293+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc
10294+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc
10295+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc
10296+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc
10297+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc
10298+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc
10299+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc
10300+        ${MICRO_DIR}/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc
10301         #### nnacl fp32 coder
10302         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/activation_fp32_coder.cc
10303         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/addn_fp32_coder.cc
10304@@ -122,6 +141,7 @@ set(CODER_OPCODERS_SRC
10305         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/lstm_fp32_coder.cc
10306         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc
10307         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/matmul_fp32_coder.cc
10308+        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/ones_like_fp32_coder.cc
10309         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/pad_fp32_coder.cc
10310         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc
10311         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/power_fp32_coder.cc
10312@@ -133,17 +153,14 @@ set(CODER_OPCODERS_SRC
10313         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc
10314         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/splice_fp32_coder.cc
10315         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/exp_fp32_coder.cc
10316+        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/fill_fp32_coder.cc
10317         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc
10318         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/prelu_fp32_coder.cc
10319         ${MICRO_DIR}/coder/opcoders/nnacl/fp32/layernorm_fp32_coder.cc
10320-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/ones_like_fp32_coder.cc
10321-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/fill_fp32_coder.cc
10322-        #### nnacl fp32_grad coder
10323-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/activation_grad_coder.cc
10324-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/adam_coder.cc
10325-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/assign_coder.cc
10326-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/biasadd_grad_coder.cc
10327-        ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/softmax_cross_entropy_with_logits_coder.cc
10328+        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc
10329+        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc
10330+        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc
10331+        ${MICRO_DIR}/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc
10332         #### nnacl int8 coder
10333         ${MICRO_DIR}/coder/opcoders/nnacl/int8/activation_int8_coder.cc
10334         ${MICRO_DIR}/coder/opcoders/nnacl/int8/affine_int8_coder.cc
10335diff --git a/mindspore/lite/tools/converter/micro/coder/coder.cc b/mindspore/lite/tools/converter/micro/coder/coder.cc
10336index cc224ae5..a502500d 100644
10337--- a/mindspore/lite/tools/converter/micro/coder/coder.cc
10338+++ b/mindspore/lite/tools/converter/micro/coder/coder.cc
10339@@ -42,6 +42,34 @@ std::shared_ptr<CoderSession> CreateCoderSession() {
10340   }
10341   return session;
10342 }
10343+
10344+int ParseMicroDynamicShape(const schema::MetaGraphT &graph, micro::MicroParam *micro_param) {
10345+  for (auto index : graph.inputIndex) {
10346+    auto input_name = graph.allTensors.at(index)->name;
10347+    if (micro_param->graph_inputs_origin_info.find(input_name) == micro_param->graph_inputs_origin_info.end() ||
10348+        micro_param->inputs_shape_by_scenes.find(input_name) == micro_param->inputs_shape_by_scenes.end()) {
10349+      MS_LOG(ERROR) << "Micro param: dynamic inputs name is invalid";
10350+      return RET_INPUT_PARAM_INVALID;
10351+    }
10352+    micro_param->graph_inputs_template.emplace_back(micro_param->graph_inputs_origin_info[input_name]);
10353+    micro_param->graph_inputs_shape_infos.emplace_back(micro_param->inputs_shape_by_scenes[input_name]);
10354+  }
10355+  return RET_OK;
10356+}
10357+
10358+int ParseMicroDynamicShape(const Model &model, micro::MicroParam *micro_param) {
10359+  for (auto index : model.graph_.input_indices_) {
10360+    auto input_name = model.graph_.all_tensors_.at(index)->name()->str();
10361+    if (micro_param->graph_inputs_origin_info.find(input_name) == micro_param->graph_inputs_origin_info.end() ||
10362+        micro_param->inputs_shape_by_scenes.find(input_name) == micro_param->inputs_shape_by_scenes.end()) {
10363+      MS_LOG(ERROR) << "Micro param: dynamic inputs name is invalid";
10364+      return RET_INPUT_PARAM_INVALID;
10365+    }
10366+    micro_param->graph_inputs_template.emplace_back(micro_param->graph_inputs_origin_info[input_name]);
10367+    micro_param->graph_inputs_shape_infos.emplace_back(micro_param->inputs_shape_by_scenes[input_name]);
10368+  }
10369+  return RET_OK;
10370+}
10371 }  // namespace
10372 int Coder::Run(const void *model_buff, size_t size, const std::string &model_name, bool end_flag, bool enable_fp16) {
10373   session_ = CreateCoderSession();
10374@@ -109,29 +137,37 @@ bool Coder::InitPath(const std::string &output_path) {
10375   return true;
10376 }
10377
10378-int Coder::MicroSourceCodeGeneration(const schema::MetaGraphT &graph, const std::string &output_path,
10379-                                     const MicroParam &param, bool enable_fp16) {
10380+int Coder::MicroSourceCodeGeneration(const schema::MetaGraphT &graph, const std::string &output_path, MicroParam *param,
10381+                                     bool enable_fp16) {
10382   flatbuffers::FlatBufferBuilder builder(kFlatbuffersBuilderInitSize);
10383   auto offset = schema::MetaGraph::Pack(builder, &graph);
10384   builder.Finish(offset);
10385   schema::FinishMetaGraphBuffer(builder, offset);
10386   size_t size = builder.GetSize();
10387-  if (ExecuteMicroGeneration(builder.GetBufferPointer(), size, output_path, param, enable_fp16) != RET_OK) {
10388+  if (!param->dynamic_symbols.empty()) {
10389+    MS_CHECK_TRUE_MSG(ParseMicroDynamicShape(graph, param) == RET_OK, RET_ERROR, "ParseMicroDynamicShape failed.");
10390+  }
10391+  if (ExecuteMicroGeneration(builder.GetBufferPointer(), size, output_path, *param, enable_fp16) != RET_OK) {
10392     MS_LOG(ERROR) << "Execute Micro failed.";
10393     return RET_ERROR;
10394   }
10395   return RET_OK;
10396 }
10397
10398-int Coder::MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path,
10399-                                     const MicroParam &param, bool enable_fp16) {
10400+int Coder::MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path, MicroParam *param,
10401+                                     bool enable_fp16) {
10402   size_t buffer_size;
10403   auto model_buf = lite::ReadFile(model_file.c_str(), &buffer_size);
10404   if (model_buf == nullptr) {
10405     MS_LOG(ERROR) << "Read model-file failed.";
10406     return RET_NULL_PTR;
10407   }
10408-  auto ret = ExecuteMicroGeneration(model_buf, buffer_size, output_path, param, enable_fp16);
10409+  Model *model = lite::Model::Import(model_buf, buffer_size);
10410+  MS_CHECK_PTR(model);
10411+  int parse_ret = param->dynamic_symbols.empty() ? RET_OK : ParseMicroDynamicShape(*model, param);
10412+  delete model;
10413+  MS_CHECK_TRUE_MSG(parse_ret == RET_OK, RET_ERROR, "ParseMicroDynamicShape failed.");
10414+  auto ret = ExecuteMicroGeneration(model_buf, buffer_size, output_path, *param, enable_fp16);
10415   if (ret != RET_OK) {
10416     MS_LOG(ERROR) << "Execute Micro failed.";
10417   }
10418@@ -199,6 +235,10 @@ int Coder::Init(const MicroParam &param) const {
10419                         DirectoryGenerator::GetInstance()->project_name());
10420   config->set_keep_original_weight(param.keep_original_weight);
10421   config->set_changeable_weights_name(param.changeable_weights_name);
10422+  config->set_graph_inputs_shape_infos(param.graph_inputs_shape_infos);
10423+  config->set_dynamic_symbols(param.dynamic_symbols);
10424+  config->set_dynamic_symbols_num(param.dynamic_symbols_num);
10425+  config->set_user_graph_inputs_template(param.graph_inputs_template);
10426
10427   auto print_parameter = [](auto name, auto value) {
10428     MS_LOG(INFO) << std::setw(20) << std::left << name << "= " << value;
10429@@ -209,6 +249,7 @@ int Coder::Init(const MicroParam &param) const {
10430   print_parameter("codePath", config->code_path());
10431   print_parameter("codeMode", config->code_mode());
10432   print_parameter("debugMode", config->debug_mode());
10433+  print_parameter("keepOriginalWeight", config->keep_original_weight());
10434   return RET_OK;
10435 }
10436 }  // namespace mindspore::lite::micro
10437diff --git a/mindspore/lite/tools/converter/micro/coder/coder.h b/mindspore/lite/tools/converter/micro/coder/coder.h
10438index c360f4c1..fad479aa 100644
10439--- a/mindspore/lite/tools/converter/micro/coder/coder.h
10440+++ b/mindspore/lite/tools/converter/micro/coder/coder.h
10441@@ -31,9 +31,9 @@ class Coder final {
10442
10443   ~Coder() = default;
10444   static int MicroSourceCodeGeneration(const schema::MetaGraphT &graph, const std::string &output_path,
10445-                                       const MicroParam &param, bool enable_fp16);
10446-  static int MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path,
10447-                                       const MicroParam &param, bool enable_fp16);
10448+                                       MicroParam *param, bool enable_fp16);
10449+  static int MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path, MicroParam *param,
10450+                                       bool enable_fp16);
10451
10452  private:
10453   static int ExecuteMicroGeneration(const void *model_buf, size_t size, const std::string &output_path,
10454diff --git a/mindspore/lite/tools/converter/micro/coder/config.h b/mindspore/lite/tools/converter/micro/coder/config.h
10455index 9be56178..fb90a2fc 100644
10456--- a/mindspore/lite/tools/converter/micro/coder/config.h
10457+++ b/mindspore/lite/tools/converter/micro/coder/config.h
10458@@ -34,6 +34,12 @@ struct MicroParam {
10459   std::string project_name;
10460   bool is_last_model{false};
10461   bool keep_original_weight{false};
10462+  std::vector<std::vector<std::string>> graph_inputs_template;
10463+  std::map<std::string, std::vector<std::string>> graph_inputs_origin_info;
10464+  std::vector<std::string> dynamic_symbols;
10465+  std::vector<size_t> dynamic_symbols_num;
10466+  std::vector<std::vector<std::vector<int>>> graph_inputs_shape_infos;
10467+  std::map<std::string, std::vector<std::vector<int>>> inputs_shape_by_scenes;
10468 };
10469
10470 class Configurator {
10471@@ -67,6 +73,29 @@ class Configurator {
10472   void set_changeable_weights_name(const std::string &weights_name) { changeable_weights_name_ = weights_name; }
10473   const std::string &changeable_weights_name() const { return changeable_weights_name_; }
10474
10475+  void set_dynamic_shape(bool dynamic_shape) { dynamic_shape_ = dynamic_shape; }
10476+  bool dynamic_shape() const { return dynamic_shape_; }
10477+
10478+  void set_dynamic_symbols(const std::vector<std::string> &dynamic_symbols) { dynamic_symbols_ = dynamic_symbols; }
10479+  const std::vector<std::string> &dynamic_symbols() const { return dynamic_symbols_; }
10480+
10481+  void set_dynamic_symbols_num(const std::vector<size_t> &dynamic_symbols_num) {
10482+    dynamic_symbols_num_ = dynamic_symbols_num;
10483+  }
10484+  const std::vector<size_t> &dynamic_symbols_num() const { return dynamic_symbols_num_; }
10485+
10486+  void set_user_graph_inputs_template(const std::vector<std::vector<std::string>> &graph_inputs_template) {
10487+    user_graph_inputs_template_ = graph_inputs_template;
10488+  }
10489+  const std::vector<std::vector<std::string>> &user_graph_inputs_template() const {
10490+    return user_graph_inputs_template_;
10491+  }
10492+
10493+  void set_graph_inputs_shape_infos(const std::vector<std::vector<std::vector<int>>> &graph_inputs_shape_infos) {
10494+    graph_inputs_shape_infos_ = graph_inputs_shape_infos;
10495+  }
10496+  const std::vector<std::vector<std::vector<int>>> &graph_inputs_shape_infos() const { return graph_inputs_shape_infos_; }
10497+
10498  private:
10499   Configurator() = default;
10500   ~Configurator() = default;
10501@@ -76,8 +105,13 @@ class Configurator {
10502   bool support_parallel_{false};
10503   bool debug_mode_{false};
10504   bool keep_original_weight_{false};
10505+  bool dynamic_shape_{false};
10506   std::string proj_dir_;
10507   std::string changeable_weights_name_;
10508+  std::vector<std::string> dynamic_symbols_;
10509+  std::vector<size_t> dynamic_symbols_num_;
10510+  std::vector<std::vector<std::vector<int>>> graph_inputs_shape_infos_;
10511+  std::vector<std::vector<std::string>> user_graph_inputs_template_;
10512 };
10513 }  // namespace mindspore::lite::micro
10514 #endif  // MICRO_CODER_CONFIG_H
10515diff --git a/mindspore/lite/tools/converter/micro/coder/context.cc b/mindspore/lite/tools/converter/micro/coder/context.cc
10516index 251b282f..7e7f640e 100644
10517--- a/mindspore/lite/tools/converter/micro/coder/context.cc
10518+++ b/mindspore/lite/tools/converter/micro/coder/context.cc
10519@@ -50,4 +50,17 @@ std::vector<std::string> CoderContext::GetInitWeightSizeCode() const {
10520 }
10521
10522 void CoderContext::AppendInitWeightSizeCode(size_t w_buf_size) { weight_buffer_size_ += w_buf_size; }
10523+
10524+const std::map<int, std::vector<int>> &CoderContext::shape_all_scenes() const {
10525+  return shape_info_container_->GetShapesWholeScenes();
10526+}
10527+const std::map<const Tensor *, std::vector<std::string>> &CoderContext::shape_templates() {
10528+  return shape_info_container_->GetWholeTemplateShape();
10529+}
10530+const std::map<int, std::vector<size_t>> &CoderContext::offset_all_scenes() {
10531+  return dynamic_mem_manager_->GetOffsetAllScenes();
10532+}
10533+const std::vector<size_t> &CoderContext::buffer_sizes() const { return dynamic_mem_manager_->GetBufferSizes(); }
10534+const std::vector<size_t> &CoderContext::workspaces() const { return dynamic_mem_manager_->GetWorkSpaces(); }
10535+std::string CoderContext::tensor_addr(const Tensor *tensor) { return dynamic_mem_manager_->GetVarTensorAddr(tensor); }
10536 }  // namespace mindspore::lite::micro
10537diff --git a/mindspore/lite/tools/converter/micro/coder/context.h b/mindspore/lite/tools/converter/micro/coder/context.h
10538index bad4ab40..b511eac1 100644
10539--- a/mindspore/lite/tools/converter/micro/coder/context.h
10540+++ b/mindspore/lite/tools/converter/micro/coder/context.h
10541@@ -25,6 +25,8 @@
10542 #include <vector>
10543 #include <algorithm>
10544 #include "src/tensor.h"
10545+#include "tools/converter/micro/coder/shape_info_container.h"
10546+#include "tools/converter/micro/coder/dynamic_mem_manager.h"
10547
10548 namespace mindspore::lite::micro {
10549 class CoderContext {
10550@@ -146,6 +148,17 @@ class CoderContext {
10551
10552   bool end_flag() { return end_flag_; }
10553
10554+  void set_shape_info_container(ShapeInfoContainer *shape_info_container) {
10555+    shape_info_container_ = shape_info_container;
10556+  }
10557+  void set_dynamic_mem_manager(DynamicMemManager *dynamic_mem_manager) { dynamic_mem_manager_ = dynamic_mem_manager; }
10558+  const std::map<int, std::vector<int>> &shape_all_scenes() const;
10559+  const std::map<const Tensor *, std::vector<std::string>> &shape_templates();
10560+  const std::map<int, std::vector<size_t>> &offset_all_scenes();
10561+  const std::vector<size_t> &buffer_sizes() const;
10562+  const std::vector<size_t> &workspaces() const;
10563+  std::string tensor_addr(const Tensor *tensor);
10564+
10565  private:
10566   std::string model_name_;
10567   std::vector<Tensor *> graph_inputs_;
10568@@ -195,6 +208,8 @@ class CoderContext {
10569   // operator C Lang files list, depended by the net.c. it will be add to CMakeLists.txt
10570   static std::set<std::string> c_files_;
10571   static size_t max_buffer_size_;
10572+  ShapeInfoContainer *shape_info_container_{nullptr};
10573+  DynamicMemManager *dynamic_mem_manager_{nullptr};
10574 };
10575 }  // namespace mindspore::lite::micro
10576 #endif  // MINDSPORE_LITE_MICRO_CODER_CONTEXT_H_
10577diff --git a/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.cc b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.cc
10578new file mode 100644
10579index 00000000..976bd852
10580--- /dev/null
10581+++ b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.cc
10582@@ -0,0 +1,116 @@
10583+/**
10584+ * Copyright 2023 Huawei Technologies Co., Ltd
10585+ *
10586+ * Licensed under the Apache License, Version 2.0 (the "License");
10587+ * you may not use this file except in compliance with the License.
10588+ * You may obtain a copy of the License at
10589+ *
10590+ * http://www.apache.org/licenses/LICENSE-2.0
10591+ *
10592+ * Unless required by applicable law or agreed to in writing, software
10593+ * distributed under the License is distributed on an "AS IS" BASIS,
10594+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10595+ * See the License for the specific language governing permissions and
10596+ * limitations under the License.
10597+ */
10598+
10599+#include "coder/dynamic_mem_manager.h"
10600+#include <vector>
10601+#include "coder/allocator/memory_manager.h"
10602+#include "coder/generator/component/component.h"
10603+
10604+namespace mindspore::lite::micro {
10605+int DynamicMemManager::AllocDynamicMem(const std::vector<std::unique_ptr<OperatorCoder>> &nodes,
10606+                                       const std::vector<Tensor *> &graph_inputs,
10607+                                       const std::vector<Tensor *> &graph_outputs,
10608+                                       const ShapeInfoContainer *shape_info_container) {
10609+  MS_CHECK_TRUE_MSG(shape_info_container, RET_NULL_PTR, "ShapeInfoContainer is a nullptr.");
10610+  for (size_t i = 0; i < graph_inputs.size(); ++i) {
10611+    graph_inputs_.insert(std::make_pair(graph_inputs.at(i), kInputPrefixName + std::to_string(i)));
10612+  }
10613+  auto var_tensor_shapes = shape_info_container->GetVarTensorInfos();
10614+  MS_CHECK_TRUE_MSG(!var_tensor_shapes.empty(), RET_ERROR, "Cannot get var-tensor's shape-info");
10615+  auto scene_num = var_tensor_shapes.begin()->second.size();
10616+  for (const auto &item : var_tensor_shapes) {
10617+    MS_CHECK_TRUE_MSG(item.first, RET_NULL_PTR, "Find a nullptr in shape-infos");
10618+    MS_CHECK_TRUE_MSG(item.second.size() == scene_num, RET_ERROR, "Shape-info is invalid.");
10619+  }
10620+  for (size_t i = 0; i < scene_num; ++i) {
10621+    for (const auto &item : var_tensor_shapes) {
10622+      item.first->ResetRefCount();
10623+      item.first->set_shape(item.second[i]);
10624+    }
10625+    auto ret = AllocDynamicMemCore(nodes, graph_outputs, i);
10626+    MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Alloc dynamic memory failed.");
10627+  }
10628+  return RET_OK;
10629+}
10630+
10631+int DynamicMemManager::AllocDynamicMemCore(const std::vector<std::unique_ptr<OperatorCoder>> &nodes,
10632+                                           const std::vector<Tensor *> &graph_outputs, int scene_index) {
10633+  if (offsets_all_scenes_.find(scene_index) != offsets_all_scenes_.end()) {
10634+    MS_LOG(ERROR) << "Current scene has been processed.";
10635+    return RET_ERROR;
10636+  }
10637+  auto manager = std::make_unique<MemoryManager>();
10638+  int ret = manager->AssignMemory(nodes, graph_outputs);
10639+  if (ret != RET_OK) {
10640+    MS_LOG(ERROR) << "assign memory failed";
10641+    return RET_ERROR;
10642+  }
10643+  std::map<Tensor *, size_t> offsets = manager->variables_offset();
10644+  if (offset_index_.empty()) {
10645+    int index = 0;
10646+    for (auto &item : offsets) {
10647+      offset_index_[item.first] = index++;
10648+      offsets_all_scenes_[scene_index].push_back(item.second);
10649+    }
10650+  } else {
10651+    MS_CHECK_TRUE_MSG(offsets.size() == offset_index_.size(), RET_ERROR, "Tensors num is not same.");
10652+    for (auto &item : offsets) {
10653+      MS_CHECK_TRUE_MSG(offset_index_.find(item.first) != offset_index_.end(), RET_ERROR, "Tensor cannot be found.");
10654+      offsets_all_scenes_[scene_index].push_back(item.second);
10655+    }
10656+  }
10657+  buffer_sizes_.push_back(manager->GetAllocatedSize());
10658+  offsets_all_scenes_[scene_index].push_back(manager->GetAllocatedSize());
10659+  return RET_OK;
10660+}
10661+
10662+std::string DynamicMemManager::GetVarTensorAddr(const Tensor *tensor) const {
10663+  if (graph_inputs_.find(tensor) != graph_inputs_.end()) {
10664+    return graph_inputs_.at(tensor);
10665+  }
10666+  if (offset_index_.find(tensor) == offset_index_.end()) {
10667+    return "";
10668+  }
10669+  if (kBufferPrefixName == nullptr || kOffsetPrefixName == nullptr) {
10670+    MS_LOG(ERROR) << "Buffer or Offset is a nullptr.";
10671+    return "";
10672+  }
10673+  return std::string(kBufferPrefixName) + " + " + kOffsetPrefixName + "[" + std::to_string(offset_index_.at(tensor)) +
10674+         "]";
10675+}
10676+
10677+std::string DynamicMemManager::AllocWorkSpace(size_t size, int index) {
10678+  if (index < 0 || static_cast<size_t>(index) >= buffer_sizes_.size()) {
10679+    return "";
10680+  }
10681+  if (static_cast<size_t>(index) + 1 >= workspaces_.size()) {
10682+    workspaces_.insert(workspaces_.end(), index + 1 - workspaces_.size(), 0);
10683+  }
10684+  if (workspaces_[index] < size) {
10685+    workspaces_[index] = size;
10686+  }
10687+  if (kBufferPrefixName == nullptr) {
10688+    MS_LOG(ERROR) << "Buffer is a nullptr.";
10689+    return "";
10690+  }
10691+  if (kOffsetPrefixName == nullptr) {
10692+    MS_LOG(ERROR) << "Offset is a nullptr.";
10693+    return "";
10694+  }
10695+  return "(" + std::string(kBufferPrefixName) + " + " + kOffsetPrefixName + "[" +
10696+         std::to_string(offsets_all_scenes_.begin()->second.size() - 1) + "])";
10697+}
10698+}  // namespace mindspore::lite::micro
10699diff --git a/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.h b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.h
10700new file mode 100644
10701index 00000000..6db7cff5
10702--- /dev/null
10703+++ b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.h
10704@@ -0,0 +1,53 @@
10705+/**
10706+ * Copyright 2023 Huawei Technologies Co., Ltd
10707+ *
10708+ * Licensed under the Apache License, Version 2.0 (the "License");
10709+ * you may not use this file except in compliance with the License.
10710+ * You may obtain a copy of the License at
10711+ *
10712+ * http://www.apache.org/licenses/LICENSE-2.0
10713+ *
10714+ * Unless required by applicable law or agreed to in writing, software
10715+ * distributed under the License is distributed on an "AS IS" BASIS,
10716+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10717+ * See the License for the specific language governing permissions and
10718+ * limitations under the License.
10719+ */
10720+
10721+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_DYNAMIC_MEM_MANAGER_H_
10722+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_DYNAMIC_MEM_MANAGER_H_
10723+
10724+#include <map>
10725+#include <vector>
10726+#include "src/tensor.h"
10727+#include "tools/converter/micro/coder/shape_info_container.h"
10728+
10729+namespace mindspore::lite::micro {
10730+class OperatorCoder;
10731+class DynamicMemManager {
10732+ public:
10733+  DynamicMemManager() = default;
10734+  virtual ~DynamicMemManager() = default;
10735+  int AllocDynamicMem(const std::vector<std::unique_ptr<OperatorCoder>> &nodes,
10736+                      const std::vector<Tensor *> &graph_inputs, const std::vector<Tensor *> &graph_outputs,
10737+                      const ShapeInfoContainer *shape_info_container);
10738+
10739+  std::string GetVarTensorAddr(const Tensor *tensor) const;
10740+  std::string AllocWorkSpace(size_t size, int index);
10741+
10742+  const std::vector<size_t> &GetBufferSizes() const { return buffer_sizes_; }
10743+  const std::vector<size_t> &GetWorkSpaces() const { return workspaces_; }
10744+  const std::map<int, std::vector<size_t>> &GetOffsetAllScenes() { return offsets_all_scenes_; }
10745+
10746+ private:
10747+  int AllocDynamicMemCore(const std::vector<std::unique_ptr<OperatorCoder>> &nodes,
10748+                          const std::vector<Tensor *> &graph_outputs, int scene_index);
10749+  std::map<int, std::vector<size_t>> offsets_all_scenes_;
10750+  std::map<const Tensor *, int> offset_index_;
10751+  std::map<const Tensor *, std::string> graph_inputs_;
10752+  std::vector<size_t> buffer_sizes_;
10753+  std::vector<size_t> workspaces_;
10754+  int model_id_;
10755+};
10756+}  // namespace mindspore::lite::micro
10757+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_DYNAMIC_MEM_MANAGER_H_
10758diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc
10759index 643cf50b..831d4259 100644
10760--- a/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc
10761+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc
10762@@ -5,7 +5,7 @@
10763  * you may not use this file except in compliance with the License.
10764  * You may obtain a copy of the License at
10765  *
10766- * http://www.apache.org/licenses/LICENSE-2.0
10767+ * http://www.apache.org/licenses/LICENSE-2.0
10768  *
10769  * Unless required by applicable law or agreed to in writing, software
10770  * distributed under the License is distributed on an "AS IS" BASIS,
10771@@ -29,32 +29,32 @@ void CodeCMakeNetLibrary(std::ofstream &ofs, const std::unique_ptr<CoderContext>
10772   }
10773   ofs << "set(OP_SRC\n";
10774   for (const std::string &c_file : ctx->c_files()) {
10775-    ofs << "    " << c_file << ".o\n";
10776+    ofs << "    " << c_file << ".obj\n";
10777   }
10778   for (int i = 0; i <= ctx->GetCurModelIndex(); ++i) {
10779-    ofs << "    weight" << i << ".c.o\n"
10780-        << "    net" << i << ".c.o\n"
10781-        << "    model" << i << ".c.o\n";
10782+    ofs << "    weight" << i << ".c.obj\n"
10783+        << "    net" << i << ".c.obj\n"
10784+        << "    model" << i << ".c.obj\n";
10785   }
10786-  ofs << "    model.c.o\n"
10787-      << "    context.c.o\n"
10788-      << "    tensor.c.o\n";
10789-  if (config->target() != kCortex_M) {
10790-    ofs << "    allocator.c.o\n";
10791+  ofs << "    model.c.obj\n"
10792+      << "    context.c.obj\n"
10793+      << "    tensor.c.obj\n";
10794+  if (config->target() != kCortex_M && !config->dynamic_shape()) {
10795+    ofs << "    allocator.c.obj\n";
10796   }
10797   if (config->debug_mode()) {
10798-    ofs << "    debug_utils.c.o\n";
10799+    ofs << "    debug_utils.c.obj\n";
10800   }
10801   if (config->support_parallel()) {
10802-    ofs << "    micro_core_affinity.c.o\n"
10803-           "    micro_thread_pool.c.o\n";
10804+    ofs << "    micro_core_affinity.c.obj\n"
10805+           "    micro_thread_pool.c.obj\n";
10806   }
10807   ofs << ")\n";
10808   std::set<std::string> kernel_cmake_asm_set_files = ctx->asm_files();
10809   if (!kernel_cmake_asm_set_files.empty() && (config->target() == kARM32 || config->target() == kARM64)) {
10810     ofs << "set(ASSEMBLY_SRC\n";
10811     for (const std::string &asm_file : kernel_cmake_asm_set_files) {
10812-      ofs << "    " << asm_file << ".o\n";
10813+      ofs << "    " << asm_file << ".obj\n";
10814     }
10815     ofs << ")\n"
10816         << "set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)\n"
10817diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc
10818index 774e8353..62c2f668 100644
10819--- a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc
10820+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc
10821@@ -16,6 +16,7 @@
10822
10823 #include "coder/generator/component/common_component.h"
10824 #include <memory>
10825+#include "coder/generator/component/const_blocks/license.h"
10826 #include "coder/generator/component/component.h"
10827 #include "coder/utils/type_cast.h"
10828 #include "coder/utils/coder_utils.h"
10829@@ -23,36 +24,59 @@
10830 #include "include/errorcode.h"
10831 #include "nnacl/op_base.h"
10832 #include "include/c_api/model_c.h"
10833+#include "tools/common/string_util.h"
10834
10835 namespace mindspore::lite::micro {
10836-const char handle_array_destroy_state[] = R"RAW(
10837-void MSTensorHandleArrayDestroy(MSTensorHandleArray inputs);
10838+const char model_runtime_init_source[] = R"RAW(
10839+typedef struct {
10840+ void *runtime_buffer;
10841+ OH_AI_TensorHandleArray inputs;
10842+ OH_AI_TensorHandleArray outputs;
10843+} MicroModel;
10844+OH_AI_ModelHandle OH_AI_ModelCreate() {
10845+ MicroModel *micro_model = (MicroModel *)malloc(sizeof(MicroModel));
10846+ if (micro_model == NULL) {
10847+   return NULL;
10848+ }
10849+)RAW";
10850+const char model_runtime_malloc_source[] = R"RAW(
10851+ int buffer_size = GetBufferSize();
10852+ void *runtime_buffer = malloc(buffer_size);
10853+ if (runtime_buffer == NULL) {
10854+   return NULL;
10855+ }
10856+ micro_model->runtime_buffer = runtime_buffer;
10857+ int ret = SetBuffer(runtime_buffer);
10858+ if (ret != OH_AI_STATUS_SUCCESS) {
10859+   return NULL;
10860+ }
10861+
10862 )RAW";
10863
10864 const char handle_array_destroy[] = R"RAW(
10865-void MSTensorHandleArrayDestroy(MSTensorHandleArray inputs) {
10866-  if (inputs.handle_list == NULL) {
10867-    return;
10868-  }
10869-  for (size_t i = 0; i < inputs.handle_num; i++) {
10870-    MicroTensor *micro_tensor = inputs.handle_list[i];
10871-    if (micro_tensor == NULL) {
10872-      continue;
10873-    }
10874-    if (micro_tensor->data != NULL && micro_tensor->owned) {
10875-      free(micro_tensor->data);
10876-      micro_tensor->data = NULL;
10877-      micro_tensor->owned = false;
10878-    }
10879-    if (micro_tensor->shape != NULL) {
10880-      free(micro_tensor->shape);
10881-      micro_tensor->shape = NULL;
10882-    }
10883-    free(micro_tensor);
10884-    micro_tensor = NULL;
10885-  }
10886-  free(inputs.handle_list);
10887-  inputs.handle_list = NULL;
10888+void OH_AI_TensorHandleArrayDestroy(OH_AI_TensorHandleArray inputs) {
10889+ if (inputs.handle_list == NULL) {
10890+   return;
10891+ }
10892+ for (size_t i = 0; i < inputs.handle_num; i++) {
10893+   MicroTensor *micro_tensor = inputs.handle_list[i];
10894+   if (micro_tensor == NULL) {
10895+     continue;
10896+   }
10897+   if (micro_tensor->data != NULL && micro_tensor->owned) {
10898+     free(micro_tensor->data);
10899+     micro_tensor->data = NULL;
10900+     micro_tensor->owned = false;
10901+   }
10902+   if (micro_tensor->shape) {
10903+     free(micro_tensor->shape);
10904+     micro_tensor->shape = NULL;
10905+   }
10906+   free(micro_tensor);
10907+   micro_tensor = NULL;
10908+ }
10909+ free(inputs.handle_list);
10910+ inputs.handle_list = NULL;
10911 }
10912
10913 )RAW";
10914@@ -62,7 +86,7 @@ const char cortex_set_workspace[] = R"RAW(
10915   if (micro_model == NULL) {
10916     return;
10917   }
10918-  if (workspace_size < MSModelCalcWorkspaceSize(model)) {
10919+  if (workspace_size < OH_AI_ModelCalcWorkspaceSize(model)) {
10920     return;
10921   }
10922   if (micro_model->inputs.handle_num != GRAPH_INPUTS_SIZE) {
10923@@ -75,29 +99,29 @@ const char cortex_set_workspace[] = R"RAW(
10924 )RAW";
10925
10926 const char micro_model_build_state[] = R"RAW(
10927-typedef MSStatus (*ModelBuild)(MSModelHandle model, const void *model_data,
10928+typedef OH_AI_Status (*ModelBuild)(OH_AI_ModelHandle model, const void *model_data,
10929                                size_t data_size,
10930-                               const MSContextHandle model_context);
10931+                               const OH_AI_ContextHandle model_context);
10932 )RAW";
10933
10934 const char micro_model_build_implement[] = R"RAW(
10935-MSStatus MSModelBuild(MSModelHandle model, const void *model_data,
10936-                      size_t data_size, MSModelType model_type,
10937-                      const MSContextHandle model_context) {
10938-  if (model_type != kMSModelTypeMindIR) {
10939-    return kMSStatusLiteNotSupport;
10940+OH_AI_Status OH_AI_ModelBuild(OH_AI_ModelHandle model, const void *model_data,
10941+                      size_t data_size, OH_AI_ModelType model_type,
10942+                      const OH_AI_ContextHandle model_context) {
10943+  if (model_type != OH_AI_MODELTYPE_MINDIR) {
10944+    return OH_AI_STATUS_LITE_NOT_SUPPORT;
10945   }
10946   if (model == NULL) {
10947-    return kMSStatusLiteParamInvalid;
10948+    return OH_AI_STATUS_LITE_PARAM_INVALID;
10949   }
10950 )RAW";
10951
10952 const char micro_model_predict_state[] = R"RAW(
10953-typedef MSStatus (*ModelPredict)(MSModelHandle model,
10954-                                 const MSTensorHandleArray inputs,
10955-                                 MSTensorHandleArray *outputs,
10956-                                 const MSKernelCallBackC before,
10957-                                 const MSKernelCallBackC after);
10958+typedef OH_AI_Status (*ModelPredict)(OH_AI_ModelHandle model,
10959+                                 const OH_AI_TensorHandleArray inputs,
10960+                                 OH_AI_TensorHandleArray *outputs,
10961+                                 const OH_AI_KernelCallBack before,
10962+                                 const OH_AI_KernelCallBack after);
10963 )RAW";
10964
10965 const char free_resource_state[] = R"RAW(
10966@@ -107,7 +131,7 @@ typedef void (*FreeResource)();
10967 void CodeMSModelCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx,
10968                                   const Configurator &config) {
10969   if (config.target() == kCortex_M) {
10970-    ofs << "size_t MSModelCalcWorkspaceSize(MSModelHandle model) {\n"
10971+    ofs << "size_t OH_AI_ModelCalcWorkspaceSize(OH_AI_ModelHandle model) {\n"
10972         << "  MicroModel *micro_model = (MicroModel *)model;\n"
10973         << "  if (micro_model == NULL) {\n"
10974         << "    return 0;\n"
10975@@ -118,13 +142,13 @@ void CodeMSModelCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr<Code
10976         << "  return micro_model->calc_work_space(model);\n"
10977         << "}\n";
10978   } else {
10979-    ofs << "size_t MSModelCalcWorkspaceSize(MSModelHandle model) {\n  return 0;\n}\n";
10980+    ofs << "size_t OH_AI_ModelCalcWorkspaceSize(OH_AI_ModelHandle model) {\n  return 0;\n}\n";
10981   }
10982   ofs << "\n";
10983 }
10984
10985 void CodeCortexCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx) {
10986-  ofs << "size_t MSModelCalcWorkspaceSize" << ctx->GetCurModelIndex() << "(MSModelHandle model) {\n"
10987+  ofs << "size_t OH_AI_ModelCalcWorkspaceSize" << ctx->GetCurModelIndex() << "(OH_AI_ModelHandle model) {\n"
10988       << "size_t shape_size = 0;\n";
10989   std::vector<Tensor *> inputs = ctx->graph_inputs();
10990   for (size_t i = 0; i < inputs.size(); ++i) {
10991@@ -141,7 +165,7 @@ void CodeCortexCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr<Coder
10992 }
10993
10994 void CodeMSModelSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config) {
10995-  ofs << "void MSModelSetWorkspace(MSModelHandle model, void *workspace, size_t workspace_size) {";
10996+  ofs << "void OH_AI_ModelSetWorkspace(OH_AI_ModelHandle model, void *workspace, size_t workspace_size) {";
10997   if (config.target() == kCortex_M) {
10998     ofs << "  MicroModel *micro_model = (MicroModel *)model;\n"
10999         << "  if (micro_model == NULL) {\n"
11000@@ -156,8 +180,8 @@ void CodeMSModelSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderCont
11001 }
11002
11003 void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx) {
11004-  ofs << "void MSModelSetWorkspace" << ctx->GetCurModelIndex()
11005-      << "(MSModelHandle model, void *workspace, size_t workspace_size) {\n";
11006+  ofs << "void OH_AI_ModelSetWorkspace" << ctx->GetCurModelIndex()
11007+      << "(OH_AI_ModelHandle model, void *workspace, size_t workspace_size) {\n";
11008   ofs << cortex_set_workspace;
11009   ofs << "  micro_model->runtime_buffer = workspace;\n"
11010          "  int buffer_size = GetBufferSize"
11011@@ -173,12 +197,12 @@ void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderConte
11012   buffer_size += WEIGHT_BUF_SIZE;
11013   buffer_size = UP_ROUND(buffer_size,4);
11014
11015-  micro_model->inputs.handle_list = (MSTensorHandle *)&buf[buffer_size];
11016+  micro_model->inputs.handle_list = (OH_AI_TensorHandle *)&buf[buffer_size];
11017   buffer_size +=  GRAPH_INPUTS_SIZE * sizeof(MicroTensor *);
11018   buffer_size = UP_ROUND(buffer_size,4);
11019   MicroTensor **input_tensors = (MicroTensor **)micro_model->inputs.handle_list;
11020
11021-  micro_model->outputs.handle_list = (MSTensorHandle *)&buf[buffer_size];
11022+  micro_model->outputs.handle_list = (OH_AI_TensorHandle *)&buf[buffer_size];
11023   buffer_size +=  GRAPH_OUTPUTS_SIZE * sizeof(MicroTensor *);
11024   buffer_size = UP_ROUND(buffer_size,4);
11025   MicroTensor **output_tensors = (MicroTensor **)micro_model->outputs.handle_list;
11026@@ -215,7 +239,7 @@ void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderConte
11027   auto array_tostring = [&ofs](Tensor *tensor, const std::string &prefix, size_t index) {
11028     ofs << kAlignedString << prefix << "_tensors[" << index << "]->type = " << EnumNameMSDataType(tensor->data_type())
11029         << ";\n";
11030-    ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = kMSFormatNHWC;\n";
11031+    ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = OH_AI_FORMAT_NHWC;\n";
11032     ofs << kAlignedString << prefix << "_tensors[" << index << "]->ndim = " << tensor->shape().size() << ";\n";
11033     size_t shape_size = tensor->shape().size();
11034     for (size_t i = 0; i < shape_size; i++) {
11035@@ -234,32 +258,31 @@ void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderConte
11036   ofs << "}\n";
11037 }
11038
11039-void CodeMSTensorHandleArrayDestroyState(std::ofstream &ofs, const Configurator &config) {
11040-  if (config.target() != kCortex_M) {
11041-    ofs << handle_array_destroy_state;
11042-  }
11043+void CodeMSModelCreateDefault(std::ofstream &ofs) {
11044+  ofs << "OH_AI_ModelHandle OH_AI_ModelCreate() { return model0; }\n";
11045 }
11046
11047-void CodeMSModelCreateDefault(std::ofstream &ofs) { ofs << "MSModelHandle MSModelCreate() { return model0; }\n"; }
11048-
11049 void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config) {
11050   if (config.target() != kCortex_M) {
11051-    ofs << "MSStatus MSModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {";
11052+    ofs << "OH_AI_Status OH_AI_ModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {";
11053     ofs << R"RAW(
11054   if (micro_model == NULL) {
11055-    return kMSStatusLiteNullptr;
11056-  }
11057-
11058-  void *runtime_buffer = GlobalMemory();
11059-  if (runtime_buffer == NULL) {
11060-    return kMSStatusLiteNullptr;
11061+    return OH_AI_STATUS_LITE_NULLPTR;
11062   }
11063-  micro_model->runtime_buffer = runtime_buffer;
11064 )RAW";
11065-    ofs << "  int ret = SetBuffer" << ctx->GetCurModelIndex() << "(((MemBlock *)runtime_buffer)->addr);\n"
11066-        << "  if (ret != kMSStatusSuccess) {\n"
11067-        << "    return kMSStatusLiteMemoryFailed;\n"
11068-        << "  }\n\n";
11069+    if (!config.dynamic_shape()) {
11070+      ofs << "void *runtime_buffer = GlobalMemory();\n"
11071+          << "if (runtime_buffer == NULL) {\n"
11072+          << "    return OH_AI_STATUS_LITE_NULLPTR;\n"
11073+          << "  }\n"
11074+          << "  micro_model->runtime_buffer = runtime_buffer;\n";
11075+      ofs << "  int ret = SetBuffer" << ctx->GetCurModelIndex() << "(((MemBlock *)runtime_buffer)->addr);\n"
11076+          << "  if (ret != OH_AI_STATUS_SUCCESS) {\n"
11077+          << "    return OH_AI_STATUS_LITE_MEMORY_FAILED;\n"
11078+          << "  }\n\n";
11079+    } else {
11080+      ofs << "  micro_model->runtime_buffer = NULL;\n";
11081+    }
11082     if (config.code_mode() == CodeMode::Inference) {
11083       ofs << "  micro_model->train_mode = false;\n";
11084     } else if (config.code_mode() == CodeMode::Train) {
11085@@ -269,7 +292,7 @@ void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr<CoderContext> &
11086       ofs << kAlignedString << prefix << "_tensors[" << index << "] = malloc(sizeof(MicroTensor));\n";
11087       ofs << kAlignedString << prefix << "_tensors[" << index << "]->type = " << EnumNameMSDataType(tensor->data_type())
11088           << ";\n";
11089-      ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = kMSFormatNHWC;\n";
11090+      ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = OH_AI_FORMAT_NHWC;\n";
11091       ofs << kAlignedString << prefix << "_tensors[" << index << "]->ndim = " << tensor->shape().size() << ";\n";
11092       size_t shape_size = tensor->shape().size();
11093       ofs << kAlignedString << prefix << "_tensors[" << index << "]->shape = "
11094@@ -289,30 +312,30 @@ void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr<CoderContext> &
11095       outputs = ctx->graph_train_outputs();
11096     }
11097     size_t inputs_size = inputs.size();
11098-    ofs << "  MSTensorHandleArray model_inputs;\n";
11099+    ofs << "  OH_AI_TensorHandleArray model_inputs;\n";
11100     ofs << "  model_inputs.handle_num = " << inputs_size << ";\n";
11101     ofs << "  MicroTensor **input_tensors = malloc(" << inputs_size << " * sizeof(MicroTensor *));\n";
11102-    ofs << "  model_inputs.handle_list = (MSTensorHandle *)(input_tensors);\n";
11103+    ofs << "  model_inputs.handle_list = (OH_AI_TensorHandle *)(input_tensors);\n";
11104     ofs << "  micro_model->inputs = model_inputs;\n";
11105     for (size_t i = 0; i < inputs_size; ++i) {
11106       Tensor *input = inputs[i];
11107       array_tostring(input, "input", i);
11108     }
11109     size_t outputs_size = outputs.size();
11110-    ofs << "  MSTensorHandleArray model_outputs;\n";
11111+    ofs << "  OH_AI_TensorHandleArray model_outputs;\n";
11112     ofs << "  model_outputs.handle_num = " << outputs_size << ";\n";
11113     ofs << "  MicroTensor **output_tensors = malloc(" << outputs_size << " * sizeof(MicroTensor *));\n";
11114-    ofs << "  model_outputs.handle_list = (MSTensorHandle *)(output_tensors);\n";
11115+    ofs << "  model_outputs.handle_list = (OH_AI_TensorHandle *)(output_tensors);\n";
11116     ofs << "  micro_model->outputs = model_outputs;\n";
11117     for (size_t i = 0; i < outputs_size; ++i) {
11118       Tensor *output = outputs[i];
11119       array_tostring(output, "output", i);
11120     }
11121-    ofs << "  return kMSStatusSuccess;\n";
11122+    ofs << "  return OH_AI_STATUS_SUCCESS;\n";
11123   } else {
11124-    ofs << "MSStatus MSModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {\n";
11125+    ofs << "OH_AI_Status OH_AI_ModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {\n";
11126     ofs << "  micro_model->train_mode = false;\n";
11127-    ofs << "  return kMSStatusSuccess;\n";
11128+    ofs << "  return OH_AI_STATUS_SUCCESS;\n";
11129   }
11130   ofs << "}\n\n";
11131 }
11132@@ -324,20 +347,20 @@ void CodeMSModelBuildCommon(std::ofstream &ofs, const Configurator &config) {
11133   ofs << R"RAW(
11134   MicroModel *micro_model = (MicroModel *)model;
11135   if (micro_model == NULL) {
11136-    return kMSStatusLiteNullptr;
11137+    return OH_AI_STATUS_LITE_NULLPTR;
11138   }
11139   if (micro_model->build == NULL) {
11140-    return kMSStatusLiteNullptr;
11141+    return OH_AI_STATUS_LITE_NULLPTR;
11142   }
11143 )RAW";
11144-  if (config.target() != kCortex_M) {
11145+  if (config.target() != kCortex_M && !config.dynamic_shape()) {
11146     ofs << "  IncRefCount();\n";
11147   }
11148   ofs << R"RAW(
11149-  MSStatus ret =
11150+  OH_AI_Status ret =
11151     micro_model->build(model, model_data, data_size, model_context);
11152-  if (ret != kMSStatusSuccess) {
11153-    MSModelDestroy(model);
11154+  if (ret != OH_AI_STATUS_SUCCESS) {
11155+    OH_AI_ModelDestroy(&model);
11156   }
11157   return ret;
11158 }
11159@@ -345,23 +368,23 @@ void CodeMSModelBuildCommon(std::ofstream &ofs, const Configurator &config) {
11160 }
11161
11162 void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t weight_size, const Configurator &config) {
11163-  ofs << "MSStatus MSModelBuild" << model_index
11164-      << "(MSModelHandle model, const void *model_data, size_t data_size,\n"
11165-         "                      const MSContextHandle model_context) {\n"
11166+  ofs << "OH_AI_Status OH_AI_ModelBuild" << model_index
11167+      << "(OH_AI_ModelHandle model, const void *model_data, size_t data_size,\n"
11168+         "                      const OH_AI_ContextHandle model_context) {\n"
11169          "  if (model == NULL) {\n"
11170-         "    return kMSStatusLiteParamInvalid;\n"
11171+         "    return OH_AI_STATUS_LITE_PARAM_INVALID;\n"
11172          "  }\n";
11173   if (config.changeable_weights_name().empty()) {
11174     ofs << "  if (data_size != " << weight_size
11175         << ") {\n"
11176-           "    return kMSStatusLiteInputParamInvalid;\n"
11177+           "    return OH_AI_STATUS_LITE_INPUT_PARAM_INVALID;\n"
11178            "  }\n";
11179   }
11180   ofs << "  MicroModel *micro_model = (MicroModel *)model;\n"
11181-         "  int ret = MSModelCreate"
11182+         "  int ret = OH_AI_ModelCreate"
11183       << model_index
11184       << "(micro_model);\n"
11185-         "  if (ret != kMSStatusSuccess) {\n"
11186+         "  if (ret != OH_AI_STATUS_SUCCESS) {\n"
11187          "    return ret;\n"
11188          "  }\n";
11189   if (config.target() != kCortex_M) {
11190@@ -372,7 +395,7 @@ void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t we
11191   if (config.support_parallel()) {
11192     ofs << "  MicroContext *micro_context = (MicroContext *)model_context;\n"
11193            "  if (micro_context == NULL) {\n"
11194-           "      return kMSStatusLiteNullptr;"
11195+           "      return OH_AI_STATUS_LITE_NULLPTR;"
11196            "  }\n"
11197            "  ret = CreateThreadPool(micro_context->thread_num_);\n"
11198            "  if(ret != RET_OK) {\n"
11199@@ -384,35 +407,172 @@ void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t we
11200   ofs << "}\n";
11201 }
11202
11203+void CodeMSModelResizeInit(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config) {
11204+  auto &dynamic_symbols_num = config.dynamic_symbols_num();
11205+  std::string array_index;
11206+  for (auto num : dynamic_symbols_num) {
11207+    array_index += "[" + std::to_string(num) + "]";
11208+  }
11209+  auto shapes = ctx->shape_all_scenes();
11210+  if (!shapes.empty()) {
11211+    auto num_of_each_scene = shapes.begin()->second.size();
11212+    ofs << "  static int shapes" << array_index << "[" + std::to_string(num_of_each_scene) + "] = {";
11213+    for (auto &item : shapes) {
11214+      auto &shape_val = item.second;
11215+      for (size_t j = 0; j < shape_val.size(); ++j) {
11216+        ofs << shape_val[j] << ", ";
11217+      }
11218+    }
11219+    ofs << "};\n";
11220+  }
11221+  auto offsets = ctx->offset_all_scenes();
11222+  if (!offsets.empty()) {
11223+    auto num_of_each_scene = offsets.begin()->second.size();
11224+    ofs << "  static int offsets" << array_index << "[" + std::to_string(num_of_each_scene) + "] = {";
11225+    for (auto &item : offsets) {
11226+      auto &offset_val = item.second;
11227+      for (size_t j = 0; j < offset_val.size(); ++j) {
11228+        ofs << offset_val[j] << ", ";
11229+      }
11230+    }
11231+    ofs << "};\n";
11232+  }
11233+  ofs << "  size_t buffer_sizes" << array_index << " = {";
11234+  auto buffer_size = ctx->buffer_sizes();
11235+  auto workspace = ctx->workspaces();
11236+  if (buffer_size.size() != workspace.size()) {
11237+    return;
11238+  }
11239+  for (size_t i = 0; i < buffer_size.size(); i++) {
11240+    ofs << buffer_size[i] + workspace[i] << ", ";
11241+  }
11242+  ofs << "};\n";
11243+}
11244+
11245+void CodeMSModelResize(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config) {
11246+  auto &shape_templates = ctx->shape_templates();
11247+  ofs << "OH_AI_Status OH_AI_ModelResize" << ctx->GetCurModelIndex()
11248+      << "(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_ShapeInfo *shape_infos, size_t "
11249+         "shape_info_num) {\n"
11250+         "  if (model == NULL) {\n"
11251+         "    return OH_AI_STATUS_LITE_PARAM_INVALID;\n"
11252+         "  }\n";
11253+  if (!config.dynamic_shape()) {
11254+    ofs << "  return OH_AI_STATUS_LITE_NOT_SUPPORT;\n";
11255+  } else {
11256+    ofs << "  MicroModel *micro_model = (MicroModel *)model;\n"
11257+        << "  if (micro_model == NULL) {\n"
11258+           "    return OH_AI_STATUS_LITE_NULLPTR;\n"
11259+           "  }\n";
11260+    CodeMSModelResizeInit(ofs, ctx, config);
11261+    std::map<std::string, std::vector<int>> symbol_to_indexes;
11262+    std::map<std::string, std::string> user_to_inner;
11263+    auto &user_graph_inputs_template = config.user_graph_inputs_template();
11264+    for (size_t i = 0; i < ctx->graph_inputs().size(); ++i) {
11265+      auto cur_tensor = ctx->graph_inputs()[i];
11266+      auto cur_shapes = shape_templates.at(cur_tensor);
11267+      for (size_t j = 0; j < cur_shapes.size(); ++j) {
11268+        if (IsNumber(cur_shapes.at(j))) {
11269+          continue;
11270+        }
11271+        ofs << "  if (shape_infos[" << i << "].shape[" << j << "] <= 0) {\n"
11272+            << "    return OH_AI_STATUS_LITE_PARAM_INVALID;\n"
11273+            << "  }\n";
11274+        ofs << "  ((MicroTensor *)(inputs.handle_list[" << i << "]))->shape[" << j << "] = shape_infos[" << i
11275+            << "].shape[" << j << "];\n";
11276+        if (symbol_to_indexes.find(cur_shapes.at(j)) != symbol_to_indexes.end()) {
11277+          continue;
11278+        }
11279+        symbol_to_indexes[cur_shapes.at(j)] = {static_cast<int>(i), static_cast<int>(j)};
11280+        user_to_inner[user_graph_inputs_template[i][j]] = cur_shapes.at(j);
11281+      }
11282+    }
11283+    int index = 0;
11284+    std::map<std::string, std::string> inner_to_outer;
11285+    for (auto &item : symbol_to_indexes) {
11286+      ofs << "  int dim" << index << " = shape_infos[" << item.second[0] << "].shape[" << item.second[1] << "];\n";
11287+      inner_to_outer[item.first] = "dim" + std::to_string(index);
11288+      ++index;
11289+    }
11290+    std::string condition;
11291+    index = 0;
11292+    for (; index < static_cast<int>(symbol_to_indexes.size()) - 1; ++index) {
11293+      condition += "store" + std::to_string(ctx->GetCurModelIndex()) + "_" + std::to_string(index) + " == dim" +
11294+                   std::to_string(index) + " && ";
11295+    }
11296+    condition += "store" + std::to_string(ctx->GetCurModelIndex()) + "_" + std::to_string(index) + " == dim" +
11297+                 std::to_string(index);
11298+    ofs << "  if (" << condition << ") {\n"
11299+        << "    return OH_AI_STATUS_SUCCESS;\n"
11300+        << "  }\n";
11301+    for (size_t i = 0; i < symbol_to_indexes.size(); ++i) {
11302+      ofs << "  store" + std::to_string(ctx->GetCurModelIndex()) + "_" << i << " = dim" << i << ";\n";
11303+    }
11304+    ofs << "  if (" << kBufferPrefixName << " != NULL) {\n";
11305+    ofs << "    free(" << kBufferPrefixName << ");\n";
11306+    ofs << "  }\n";
11307+    std::string real_array_index;
11308+    auto &dynamic_symbols = config.dynamic_symbols();
11309+    for (auto &symbol : dynamic_symbols) {
11310+      real_array_index += "[" + inner_to_outer[user_to_inner[symbol]] + " - 1]";
11311+    }
11312+    ofs << "  " << kBufferPrefixName << " = malloc(buffer_sizes" << real_array_index << ");\n";
11313+    ofs << "  micro_model->runtime_buffer = " << kBufferPrefixName << ";\n";
11314+    ofs << "  " << kShapePrefixName << " = &shapes" << real_array_index << "[0];\n";
11315+    ofs << "  " << kOffsetPrefixName << " = &offsets" << real_array_index << "[0];\n";
11316+    ofs << "  OH_AI_TensorHandleArray outputs = OH_AI_ModelGetOutputs(model);\n";
11317+    for (size_t i = 0; i < ctx->graph_outputs().size(); ++i) {
11318+      ofs << "  OH_AI_TensorSetData(outputs.handle_list[" << i << "], NULL);\n";
11319+      auto cur_tensor = ctx->graph_outputs()[i];
11320+      auto cur_shapes = shape_templates.at(cur_tensor);
11321+      for (size_t j = 0; j < cur_shapes.size(); ++j) {
11322+        if (IsNumber(cur_shapes.at(j))) {
11323+          continue;
11324+        }
11325+        ofs << "  ((MicroTensor *)(outputs.handle_list[" << i << "]))->shape[" << j << "] = " << cur_shapes.at(j)
11326+            << ";\n";
11327+      }
11328+    }
11329+    ofs << "  return OH_AI_STATUS_SUCCESS;\n";
11330+  }
11331+  ofs << "}\n";
11332+}
11333+
11334 void CodeMSModelDestory(std::ofstream &ofs, const Configurator *config) {
11335-  if (config->target() != kCortex_M) {
11336+  if (config->code_mode() == CodeMode::Inference && config->target() != kCortex_M) {
11337     ofs << handle_array_destroy;
11338   }
11339-  ofs << "void MSModelDestroy(MSModelHandle *model) {\n";
11340+  ofs << "void OH_AI_ModelDestroy(OH_AI_ModelHandle *model) {\n";
11341+  ofs << "  if (*model) {\n"
11342+         "    MicroModel *micro_model = (MicroModel *)*model;\n";
11343   if (config->target() != kCortex_M) {
11344-    ofs << "  if (*model) {\n"
11345-           "    MicroModel *micro_model = (MicroModel *)*model;\n";
11346-    ofs << "    if (micro_model->runtime_buffer) {\n"
11347-           "      micro_model->runtime_buffer = NULL;\n"
11348-           "    }\n";
11349-    ofs << "    MSTensorHandleArrayDestroy(micro_model->inputs);\n"
11350-           "    MSTensorHandleArrayDestroy(micro_model->outputs);\n"
11351-           "    micro_model->inputs.handle_list = NULL;\n"
11352+    ofs << "    if (micro_model->runtime_buffer) {\n";
11353+    if (config->dynamic_shape()) {
11354+      ofs << "      free(micro_model->runtime_buffer);\n";
11355+    } else {
11356+      ofs << "      micro_model->runtime_buffer = NULL;\n";
11357+    }
11358+    ofs << "    }\n";
11359+  }
11360+  ofs << "    OH_AI_TensorHandleArrayDestroy(micro_model->inputs);\n"
11361+         "    OH_AI_TensorHandleArrayDestroy(micro_model->outputs);\n";
11362+  if (config->code_mode() == CodeMode::Inference) {
11363+    ofs << "    micro_model->inputs.handle_list = NULL;\n"
11364            "    micro_model->outputs.handle_list = NULL;\n"
11365-           "    micro_model->free_resource();\n"
11366-           "    DecRefCount();\n"
11367-           "  }\n";
11368-
11369-    if (config->support_parallel()) {
11370-      ofs << "  ClearThreadPool();\n";
11371+           "    micro_model->free_resource();\n";
11372+    if (!config->dynamic_shape()) {
11373+      ofs << "    DecRefCount();\n";
11374     }
11375+    ofs << "  }\n";
11376   } else {
11377-    ofs << "  if (*model) {\n"
11378-           "    MicroModel *micro_model = (MicroModel *)*model;\n";
11379-    ofs << "    micro_model->runtime_buffer = NULL;\n"
11380+    ofs << "    free(*model);\n"
11381            "    *model = NULL;\n"
11382            "  }\n";
11383   }
11384+
11385+  if (config->support_parallel()) {
11386+    ofs << "  ClearThreadPool();\n";
11387+  }
11388   ofs << "}\n";
11389 }
11390
11391@@ -420,14 +580,14 @@ void CodeMSModelPredictState(std::ofstream &ofs) { ofs << micro_model_predict_st
11392
11393 void CodeMSModelPredictCommon(std::ofstream &ofs) {
11394   ofs << R"RAW(
11395-MSStatus MSModelPredict(MSModelHandle model, const MSTensorHandleArray inputs, MSTensorHandleArray *outputs,
11396-                        const MSKernelCallBackC before, const MSKernelCallBackC after) {
11397+OH_AI_Status OH_AI_ModelPredict(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_TensorHandleArray *outputs,
11398+                        const OH_AI_KernelCallBack before, const OH_AI_KernelCallBack after) {
11399   MicroModel *micro_model = (MicroModel *)model;
11400   if (micro_model == NULL) {
11401-    return kMSStatusLiteNullptr;
11402+    return OH_AI_STATUS_LITE_NULLPTR;
11403   }
11404   if (micro_model->predict == NULL) {
11405-    return kMSStatusLiteNullptr;
11406+    return OH_AI_STATUS_LITE_NULLPTR;
11407   }
11408   return micro_model->predict(model, inputs, outputs, before, after);
11409 }
11410@@ -438,35 +598,35 @@ MSStatus MSModelPredict(MSModelHandle model, const MSTensorHandleArray inputs, M
11411 void CodeMSModelPredict(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config) {
11412   auto inputs_num = ctx->graph_inputs().size();
11413   auto outputs_num = ctx->graph_outputs().size();
11414-  ofs << "MSStatus MSModelPredict" << ctx->GetCurModelIndex()
11415-      << "(MSModelHandle model, const MSTensorHandleArray inputs, MSTensorHandleArray *outputs,\n"
11416-      << "                         const MSKernelCallBackC before, const MSKernelCallBackC after) {\n";
11417+  ofs << "OH_AI_Status OH_AI_ModelPredict" << ctx->GetCurModelIndex()
11418+      << "(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_TensorHandleArray *outputs,\n"
11419+      << "                         const OH_AI_KernelCallBack before, const OH_AI_KernelCallBack after) {\n";
11420   ofs << R"RAW(
11421   MicroModel *micro_model = (MicroModel *)model;
11422   if (micro_model == NULL) {
11423-    return kMSStatusLiteNullptr;
11424+    return OH_AI_STATUS_LITE_NULLPTR;
11425   }
11426   if (micro_model->runtime_buffer == NULL) {
11427-    return kMSStatusLiteMemoryFailed;
11428+    return OH_AI_STATUS_LITE_MEMORY_FAILED;
11429   }
11430 )RAW";
11431   ofs << "  if (inputs.handle_num != " << inputs_num << ") {\n";
11432-  ofs << "    return kMSStatusLiteParamInvalid;\n";
11433+  ofs << "    return OH_AI_STATUS_LITE_PARAM_INVALID;\n";
11434   ofs << "  }\n";
11435   ofs << "  if (outputs->handle_num != " << outputs_num << ") {\n";
11436-  ofs << "    return kMSStatusLiteParamInvalid;\n";
11437+  ofs << "    return OH_AI_STATUS_LITE_PARAM_INVALID;\n";
11438   ofs << "  }\n";
11439-  if (config.target() != kCortex_M) {
11440+  if (config.target() != kCortex_M && !config.dynamic_shape()) {
11441     ofs << "  if (!LockBuffer(micro_model->runtime_buffer)) {\n"
11442         << "    void *buffer = Malloc(GetBufferSize" << ctx->GetCurModelIndex() << "());\n"
11443         << "    if (buffer == NULL) {\n"
11444-        << "      return kMSStatusLiteNullptr;\n"
11445+        << "      return OH_AI_STATUS_LITE_NULLPTR;\n"
11446         << "    }\n"
11447         << "    if (micro_model->runtime_buffer != buffer) {\n"
11448         << "      micro_model->runtime_buffer = buffer;\n"
11449         << "      int ret = SetBuffer" << ctx->GetCurModelIndex() << "(((MemBlock *)buffer)->addr);\n"
11450-        << "      if (ret != kMSStatusSuccess) {\n"
11451-        << "        return kMSStatusLiteMemoryFailed;\n"
11452+        << "      if (ret != OH_AI_STATUS_SUCCESS) {\n"
11453+        << "        return OH_AI_STATUS_LITE_MEMORY_FAILED;\n"
11454         << "      }\n"
11455         << "    }\n"
11456         << "  }\n";
11457@@ -495,8 +655,7 @@ void CodeMSModelPredict(std::ofstream &ofs, const std::unique_ptr<CoderContext>
11458   ofs << "    }\n";
11459   ofs << "  }\n";
11460   ofs << "\n";
11461-  ofs << "  void *outputs_data_array[" << outputs_num << "];\n";
11462-  ofs << "  int expect_out_types[" << outputs_num << "] = {";
11463+  ofs << "  int cur_out_types[" << outputs_num << "] = {";
11464   for (size_t i = 0; i < outputs_num; ++i) {
11465     ofs << ctx->graph_outputs().at(i)->data_type() << ", ";
11466   }
11467@@ -506,21 +665,18 @@ void CodeMSModelPredict(std::ofstream &ofs, const std::unique_ptr<CoderContext>
11468     ofs << "false, ";
11469   }
11470   ofs << "};\n";
11471-  ofs << "  for (int i = 0; i < " << outputs_num << "; i++) {\n";
11472-  ofs << "    outputs_data_array[i] = MSTensorGetMutableData(outputs->handle_list[i]);\n";
11473-  ofs << "  }\n";
11474-  ofs << "  CopyOutputsData" << ctx->GetCurModelIndex()
11475-      << "(outputs, outputs_data_array, expect_out_types, out_type_changed);\n";
11476-  if (config.target() != kCortex_M) {
11477+  ofs << "  OH_AI_Status ret = CopyOutputsData" << ctx->GetCurModelIndex()
11478+      << "(outputs, cur_out_types, out_type_changed);\n";
11479+  if (config.target() != kCortex_M && !config.dynamic_shape()) {
11480     ofs << "  UnLockBuffer(micro_model->runtime_buffer);\n";
11481   }
11482-  ofs << "  return kMSStatusSuccess;\n";
11483+  ofs << "  return ret;\n";
11484   ofs << "}\n";
11485 }
11486
11487 void CodeCopyOutputsState(std::ofstream &ofs, const int model_index) {
11488-  ofs << "int CopyOutputsData" << model_index
11489-      << "(MSTensorHandleArray *outputs_ori, void **outputs, int *expect_types, bool *type_changed);\n\n";
11490+  ofs << "OH_AI_Status CopyOutputsData" << model_index
11491+      << "(OH_AI_TensorHandleArray *outputs_ori, int *cur_out_types, bool *type_changed);\n\n";
11492 }
11493
11494 void CodeCopyOutputsImplement(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx) {
11495@@ -528,56 +684,60 @@ void CodeCopyOutputsImplement(std::ofstream &ofs, const std::unique_ptr<CoderCon
11496   std::vector<Tensor *> outputs = ctx->graph_outputs();
11497   size_t outputs_size = outputs.size();
11498
11499-  ofs << "int CopyOutputsData" << ctx->GetCurModelIndex()
11500-      << "(MSTensorHandleArray *outputs_ori, void **outputs, int *expect_types, bool *type_changed) {\n"
11501-         "  if (outputs_ori == NULL || outputs == NULL) {\n"
11502-         "    return RET_ERROR;\n"
11503+  ofs << "OH_AI_Status CopyOutputsData" << ctx->GetCurModelIndex()
11504+      << "(OH_AI_TensorHandleArray *outputs_ori, int *cur_out_types, bool *type_changed) {\n"
11505+         "  if (outputs_ori == NULL || cur_out_types == NULL || type_changed == NULL) {\n"
11506+         "    return OH_AI_STATUS_LITE_NULLPTR;\n"
11507          "  }\n";
11508   ofs << "  unsigned char *buffer[" << outputs_size << "] = {";
11509   for (size_t i = 0; i < outputs_size; ++i) {
11510-    ofs << tensor_map[outputs[i]] << ", ";
11511-  }
11512-  ofs << "};\n";
11513-  ofs << "  size_t buffer_size[" << outputs_size << "] = {";
11514-  for (size_t i = 0; i < outputs_size; ++i) {
11515-    Tensor *output = outputs[i];
11516-    MS_CHECK_PTR_IF_NULL(output);
11517-    ofs << output->Size() << ", ";
11518+    auto out_str = ctx->tensor_addr(outputs[i]);
11519+    if (out_str.empty()) {
11520+      ofs << tensor_map[outputs[i]] << ", ";
11521+    } else {
11522+      ofs << out_str << ", ";
11523+    }
11524   }
11525   ofs << "};\n";
11526   ofs << "  for (int i = 0; i < " << outputs_size << "; i++) {\n"
11527       << "    MicroTensor *micro_tensor = (MicroTensor *)outputs_ori->handle_list[i];\n"
11528-      << "    int cur_type = micro_tensor->type;\n"
11529-      << "    int expect_type = expect_types[i];\n";
11530-  ofs << "    if (cur_type == expect_type) {\n"
11531-      << "      memcpy(outputs[i], buffer[i], buffer_size[i]);\n"
11532+      << "    int expect_type = micro_tensor->type;\n"
11533+      << "    int cur_type = cur_out_types[i];\n";
11534+  ofs << "    if (expect_type == cur_type) {\n"
11535+      << "      micro_tensor->data = buffer[i];\n"
11536+      << "      micro_tensor->owned = false;\n"
11537       << "      continue;\n"
11538       << "    }\n"
11539+      << "#ifdef ENABLE_FP16\n"
11540       << "    int shape_size = micro_tensor->ndim;\n"
11541       << "    int num = 1;\n"
11542-      << "    for (int i = 0; i < shape_size; ++i) {\n"
11543-      << "      num *= micro_tensor->shape[i];\n"
11544+      << "    for (int j = 0; j < shape_size; ++j) {\n"
11545+      << "      num *= micro_tensor->shape[j];\n"
11546       << "    }\n";
11547-  ofs << "    int type_trans_mode = TypeTransMode_MAX;\n"
11548-         "    if (expect_type == kMSDataTypeNumberTypeFloat16 && cur_type == kMSDataTypeNumberTypeFloat32) {\n"
11549-         "      type_trans_mode = TypeTransMode_FP32_TO_FP16;\n"
11550-         "    } else if (expect_type == kMSDataTypeNumberTypeFloat32 && cur_type == kMSDataTypeNumberTypeFloat16) {\n"
11551-         "      type_trans_mode = TypeTransMode_FP16_TO_FP32;\n"
11552-         "    }\n";
11553+  ofs
11554+    << "    int type_trans_mode = TypeTransMode_MAX;\n"
11555+       "    if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT16 && cur_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32) {\n"
11556+       "      type_trans_mode = TypeTransMode_FP32_TO_FP16;\n"
11557+       "    } else if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32 && cur_type == "
11558+       "OH_AI_DATATYPE_NUMBERTYPE_FLOAT16) {\n"
11559+       "      type_trans_mode = TypeTransMode_FP16_TO_FP32;\n"
11560+       "    }\n";
11561   ofs << "    if (type_trans_mode == TypeTransMode_UNSUPPORT) {\n"
11562-      << "      return kMSStatusLiteNotSupport;\n"
11563+      << "      return OH_AI_STATUS_LITE_NOT_SUPPORT;\n"
11564       << "    }\n";
11565-  ofs << "#ifdef ENABLE_FP16\n"
11566-      << "    if (type_trans_mode == TypeTransMode_FP32_TO_FP16) {\n"
11567-      << "      Fp32CastToFp16((float *)(buffer[i]), (float16_t *)&outputs, num);\n"
11568+  ofs << "    void *out_data = OH_AI_TensorGetMutableData(micro_tensor);\n";
11569+  ofs << "    if (type_trans_mode == TypeTransMode_FP32_TO_FP16) {\n"
11570+      << "      Fp32CastToFp16((float *)(buffer[i]), (float16_t *)out_data, num);\n"
11571       << "      type_changed[i] = true;\n"
11572       << "    } else if (type_trans_mode == TypeTransMode_FP16_TO_FP32) {\n"
11573-      << "      Fp16CastToFp32((float16_t *)&outputs, (float *)(buffer[i]), num);\n"
11574+      << "      Fp16CastToFp32((float16_t *)(buffer[i]), (float *)out_data, num);\n"
11575       << "      type_changed[i] = true;\n"
11576       << "    }\n"
11577+      << "#else\n"
11578+      << "    return OH_AI_STATUS_LITE_NOT_SUPPORT;\n"
11579       << "#endif\n"
11580       << "  }\n";
11581-  ofs << "  return RET_OK;\n"
11582+  ofs << "  return OH_AI_STATUS_SUCCESS;\n"
11583          "}\n\n";
11584 }
11585
11586@@ -688,6 +848,16 @@ void CodeInitResourceImplement(std::ofstream &ofs, const std::unique_ptr<CoderCo
11587          "}\n";
11588 }
11589
11590+void CodeResetImplement(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config) {
11591+  ofs << "void Reset" << ctx->GetCurModelIndex() << "() {\n";
11592+  auto &dynamic_symbols = config.dynamic_symbols();
11593+  for (size_t i = 0; i < dynamic_symbols.size(); ++i) {
11594+    ofs << "  store" << ctx->GetCurModelIndex() << "_" << i << " = -1;\n";
11595+  }
11596+  ofs << "  FreeResource" << ctx->GetCurModelIndex() << "();\n";
11597+  ofs << "}\n";
11598+}
11599+
11600 void CodeFreeResourceState(std::ofstream &ofs) { ofs << free_resource_state; }
11601
11602 void CodeFreeResourceImplement(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx,
11603diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h
11604index 56209f05..6f0c7736 100644
11605--- a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h
11606+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h
11607@@ -32,12 +32,13 @@ void CodeMSModelCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr<Code
11608 void CodeCortexCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx);
11609 void CodeMSModelSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config);
11610 void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx);
11611-void CodeMSTensorHandleArrayDestroyState(std::ofstream &ofs, const Configurator &config);
11612 void CodeMSModelCreateDefault(std::ofstream &ofs);
11613 void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config);
11614 void CodeMSModelBuildState(std::ofstream &ofs);
11615 void CodeMSModelBuildCommon(std::ofstream &ofs, const Configurator &config);
11616 void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t weight_size, const Configurator &config);
11617+void CodeMSModelResizeInit(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config);
11618+void CodeMSModelResize(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config);
11619 void CodeMSModelDestory(std::ofstream &ofs, const Configurator *config);
11620 void CodeMSModelPredictState(std::ofstream &ofs);
11621 void CodeMSModelPredictCommon(std::ofstream &ofs);
11622@@ -57,6 +58,7 @@ void CodeGraphQuantArgsImplement(std::ofstream &ofs, const std::unique_ptr<Coder
11623 void CodeManageResourceState(std::ofstream &ofs, const int model_index);
11624 void CodeInitResourceImplement(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx);
11625
11626+void CodeResetImplement(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx, const Configurator &config);
11627 void CodeFreeResourceState(std::ofstream &ofs);
11628 void CodeFreeResourceImplement(std::ofstream &ofs, const std::unique_ptr<CoderContext> &ctx,
11629                                const Configurator &config);
11630diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc
11631index b2ed21be..0ee02e0c 100644
11632--- a/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc
11633+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc
11634@@ -24,6 +24,8 @@ const char *kOutputPrefixName = nullptr;
11635 const char *kWeightPrefixName = nullptr;
11636 const char *kBufferPrefixName = nullptr;
11637 const char *kBufferPrefixNameAdd = nullptr;
11638+const char *kOffsetPrefixName = nullptr;
11639+const char *kShapePrefixName = nullptr;
11640
11641 char *ModifyPrefixName(char *name, int model_index, const std::string &prefix) {
11642   if (name != nullptr) {
11643@@ -57,6 +59,8 @@ void FreeGlobalVariable() {
11644   Free(kWeightPrefixName);
11645   Free(kBufferPrefixName);
11646   Free(kBufferPrefixNameAdd);
11647+  Free(kOffsetPrefixName);
11648+  Free(kShapePrefixName);
11649 }
11650
11651 void InitGlobalVariable(int model_index) {
11652@@ -65,5 +69,7 @@ void InitGlobalVariable(int model_index) {
11653   kWeightPrefixName = ModifyPrefixName(const_cast<char *>(kWeightPrefixName), model_index, "_weight");
11654   kBufferPrefixName = ModifyPrefixName(const_cast<char *>(kBufferPrefixName), model_index, "_buffer");
11655   kBufferPrefixNameAdd = ModifyPrefixName(const_cast<char *>(kBufferPrefixNameAdd), model_index, "_buffer + ");
11656+  kOffsetPrefixName = ModifyPrefixName(const_cast<char *>(kOffsetPrefixName), model_index, "_offset");
11657+  kShapePrefixName = ModifyPrefixName(const_cast<char *>(kShapePrefixName), model_index, "_shape");
11658 }
11659 }  // namespace mindspore::lite::micro
11660diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/component.h b/mindspore/lite/tools/converter/micro/coder/generator/component/component.h
11661index 0e943317..e084d692 100644
11662--- a/mindspore/lite/tools/converter/micro/coder/generator/component/component.h
11663+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/component.h
11664@@ -16,7 +16,6 @@
11665
11666 #ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_GENERATOR_COMPONENT_COMPONENT_H_
11667 #define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_GENERATOR_COMPONENT_COMPONENT_H_
11668-#include <string>
11669
11670 namespace mindspore::lite::micro {
11671 extern const char *kInputPrefixName;
11672@@ -26,6 +25,8 @@ constexpr auto kPackWeightOffsetName = "w_offset";
11673 constexpr auto kPackWeightSizeName = "w_size";
11674 extern const char *kBufferPrefixName;
11675 extern const char *kBufferPrefixNameAdd;
11676+extern const char *kOffsetPrefixName;
11677+extern const char *kShapePrefixName;
11678 void FreeGlobalVariable();
11679 void InitGlobalVariable(int model_index);
11680
11681diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc
11682index 91f2ca89..ad638276 100644
11683--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc
11684+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc
11685@@ -53,7 +53,7 @@ const char benchmark_source[] = R"RAW(/**
11686
11687 void usage() {
11688   printf(
11689-    "-- mindspore benchmark params usage:\n"
11690+    "-- mindspore benchmark params usage:\n"
11691     "args[0]: executable file\n"
11692     "args[1]: inputs binary file\n"
11693     "args[2]: model weight binary file\n"
11694@@ -67,38 +67,38 @@ void usage() {
11695
11696 uint64_t GetTimeUs() {
11697   const int USEC = 1000000;
11698-  const int MSEC = 1000;
11699+  const int MSEC = 1000;
11700   struct timespec ts = {0, 0};
11701   if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
11702     return 0;
11703   }
11704-  uint64_t retval = (uint64_t)((ts.tv_sec * USEC) + (ts.tv_nsec / MSEC));
11705+  uint64_t retval = (uint64_t)((ts.tv_sec * USEC) + (ts.tv_nsec / MSEC));
11706   return retval;
11707 }
11708
11709-void PrintTensorHandle(MSTensorHandle tensor) {
11710-  printf("name: %s, ", MSTensorGetName(tensor));
11711-  MSDataType data_type = MSTensorGetDataType(tensor);
11712+void PrintTensorHandle(OH_AI_TensorHandle tensor) {
11713+  printf("name: %s, ", OH_AI_TensorGetName(tensor));
11714+  OH_AI_DataType data_type = OH_AI_TensorGetDataType(tensor);
11715   printf("DataType: %d, ", data_type);
11716-  size_t element_num = (size_t)(MSTensorGetElementNum(tensor));
11717+  size_t element_num = (size_t)(OH_AI_TensorGetElementNum(tensor));
11718   printf("Elements: %zu, ", element_num);
11719   printf("Shape: [");
11720   size_t shape_num = 0;
11721-  const int64_t *dims = MSTensorGetShape(tensor, &shape_num);
11722+  const int64_t *dims = OH_AI_TensorGetShape(tensor, &shape_num);
11723   for (size_t i = 0; i < shape_num; i++) {
11724     printf("%d ", (int)dims[i]);
11725   }
11726   printf("], Data: \n");
11727-  void *data = MSTensorGetMutableData(tensor);
11728+  void *data = OH_AI_TensorGetMutableData(tensor);
11729   element_num = element_num > 10 ? 10 : element_num;
11730   switch (data_type) {
11731-    case kMSDataTypeNumberTypeFloat32: {
11732+    case OH_AI_DATATYPE_NUMBERTYPE_FLOAT32: {
11733       for (size_t i = 0; i < element_num; i++) {
11734         printf("%.6f, ", ((float *)data)[i]);
11735       }
11736       printf("\n");
11737     } break;
11738-    case kMSDataTypeNumberTypeFloat16:
11739+    case OH_AI_DATATYPE_NUMBERTYPE_FLOAT16:
11740 #ifdef ENABLE_FP16
11741     {
11742       for (size_t i = 0; i < element_num; i++) {
11743@@ -107,25 +107,25 @@ void PrintTensorHandle(MSTensorHandle tensor) {
11744       printf("\n");
11745     } break;
11746 #endif
11747-    case kMSDataTypeNumberTypeInt16: {
11748+    case OH_AI_DATATYPE_NUMBERTYPE_INT16: {
11749       for (size_t i = 0; i < element_num; i++) {
11750         printf("%" PRId16, ((int16_t *)data)[i]);
11751       }
11752       printf("\n");
11753     } break;
11754-    case kMSDataTypeNumberTypeInt32: {
11755+    case OH_AI_DATATYPE_NUMBERTYPE_INT32: {
11756       for (size_t i = 0; i < element_num; i++) {
11757         printf("%" PRId32, ((int32_t *)data)[i]);
11758       }
11759       printf("\n");
11760     } break;
11761-    case kMSDataTypeNumberTypeInt8: {
11762+    case OH_AI_DATATYPE_NUMBERTYPE_INT8: {
11763       for (size_t i = 0; i < element_num; i++) {
11764         printf("%" PRIi8, ((int8_t *)data)[i]);
11765       }
11766       printf("\n");
11767     } break;
11768-    case kMSDataTypeNumberTypeUInt8: {
11769+    case OH_AI_DATATYPE_NUMBERTYPE_UINT8: {
11770       for (size_t i = 0; i < element_num; i++) {
11771         printf("%u", ((uint8_t *)data)[i]);
11772       }
11773@@ -141,31 +141,31 @@ int main(int argc, const char **argv) {
11774   if (argc < 2) {
11775     printf("input command is invalid\n");
11776     usage();
11777-    return kMSStatusLiteError;
11778+    return OH_AI_STATUS_LITE_ERROR;
11779   }
11780   printf("=======run benchmark======\n");
11781
11782-  MSContextHandle ms_context_handle = MSContextCreate();
11783+  OH_AI_ContextHandle ms_context_handle = OH_AI_ContextCreate();
11784   if (argc >= 6) {
11785     int thread_num = atoi(argv[5]);
11786     if (thread_num < 1 || thread_num > kMaxThreadNum) {
11787       printf("Thread number error! It should be greater than 0 and less than 5\n");
11788-      return kMSStatusLiteParamInvalid;
11789+      return OH_AI_STATUS_LITE_PARAM_INVALID;
11790     }
11791-    MSContextSetThreadNum(ms_context_handle, thread_num);
11792+    OH_AI_ContextSetThreadNum(ms_context_handle, thread_num);
11793   }
11794-  printf("ThreadNum: %d.\n", MSContextGetThreadNum(ms_context_handle));
11795+  printf("ThreadNum: %d.\n", OH_AI_ContextGetThreadNum(ms_context_handle));
11796
11797   int bind_mode = kBindDefault;
11798   if (argc >= 7) {
11799     bind_mode = atoi(argv[6]);
11800     if (bind_mode < 0 || bind_mode > 2) {
11801       printf("Thread bind mode error! 0: No bind, 1: Bind hign cpu, 2: Bind mid cpu.\n");
11802-      return kMSStatusLiteParamInvalid;
11803+      return OH_AI_STATUS_LITE_PARAM_INVALID;
11804     }
11805   }
11806-  MSContextSetThreadAffinityMode(ms_context_handle, bind_mode);
11807-  printf("BindMode: %d.\n", MSContextGetThreadAffinityMode(ms_context_handle));
11808+  OH_AI_ContextSetThreadAffinityMode(ms_context_handle, bind_mode);
11809+  printf("BindMode: %d.\n", OH_AI_ContextGetThreadAffinityMode(ms_context_handle));
11810
11811   void *model_buffer = NULL;
11812   int model_size = 0;
11813@@ -174,14 +174,14 @@ int main(int argc, const char **argv) {
11814     model_buffer = ReadInputData(argv[2], &model_size);
11815     if (model_buffer == NULL) {
11816       printf("Read model file failed.");
11817-      return kMSStatusLiteParamInvalid;
11818+      return OH_AI_STATUS_LITE_PARAM_INVALID;
11819     }
11820   }
11821-  MSModelHandle model_handle = MSModelCreate();
11822-  int ret = MSModelBuild(model_handle, model_buffer, model_size, kMSModelTypeMindIR, ms_context_handle);
11823-  MSContextDestroy(&ms_context_handle);
11824-  if (ret != kMSStatusSuccess) {
11825-    printf("MSModelBuildFromFile failed, ret: %d\n", ret);
11826+  OH_AI_ModelHandle model_handle = OH_AI_ModelCreate();
11827+  int ret = OH_AI_ModelBuild(model_handle, model_buffer, model_size, OH_AI_MODELTYPE_MINDIR, ms_context_handle);
11828+  OH_AI_ContextDestroy(&ms_context_handle);
11829+  if (ret != OH_AI_STATUS_SUCCESS) {
11830+    printf("OH_AI_ModelBuild failed, ret: %d\n", ret);
11831     free(model_buffer);
11832     model_buffer = NULL;
11833     return ret;
11834@@ -191,33 +191,33 @@ int main(int argc, const char **argv) {
11835     model_buffer = NULL;
11836   }
11837   // set model inputs tensor data
11838-  MSTensorHandleArray inputs_handle = MSModelGetInputs(model_handle);
11839+  OH_AI_TensorHandleArray inputs_handle = OH_AI_ModelGetInputs(model_handle);
11840   if (inputs_handle.handle_list == NULL) {
11841-    printf("MSModelGetInputs failed, ret: %d", ret);
11842+    printf("OH_AI_ModelGetInputs failed, ret: %d", ret);
11843     return ret;
11844   }
11845   size_t inputs_num = inputs_handle.handle_num;
11846   void *inputs_binbuf[inputs_num];
11847   int inputs_size[inputs_num];
11848   for (size_t i = 0; i < inputs_num; ++i) {
11849-    MSTensorHandle tensor = inputs_handle.handle_list[i];
11850-    inputs_size[i] = (int)MSTensorGetDataSize(tensor);
11851+    OH_AI_TensorHandle tensor = inputs_handle.handle_list[i];
11852+    inputs_size[i] = (int)OH_AI_TensorGetDataSize(tensor);
11853   }
11854   ret = ReadInputsFile((char *)(argv[1]), inputs_binbuf, inputs_size, (int)inputs_num);
11855   if (ret != 0) {
11856-    MSModelDestroy(&model_handle);
11857+    OH_AI_ModelDestroy(&model_handle);
11858     return ret;
11859   }
11860   for (size_t i = 0; i < inputs_num; ++i) {
11861-    void *input_data = MSTensorGetMutableData(inputs_handle.handle_list[i]);
11862+    void *input_data = OH_AI_TensorGetMutableData(inputs_handle.handle_list[i]);
11863     memcpy(input_data, inputs_binbuf[i], inputs_size[i]);
11864     free(inputs_binbuf[i]);
11865     inputs_binbuf[i] = NULL;
11866   }
11867
11868-  MSTensorHandleArray outputs_handle = MSModelGetOutputs(model_handle);
11869+  OH_AI_TensorHandleArray outputs_handle = OH_AI_ModelGetOutputs(model_handle);
11870   if (!outputs_handle.handle_list) {
11871-    printf("MSModelGetOutputs failed, ret: %d", ret);
11872+    printf("OH_AI_ModelGetOutputs failed, ret: %d", ret);
11873     return ret;
11874   }
11875
11876@@ -226,15 +226,15 @@ int main(int argc, const char **argv) {
11877       warm_up_loop_count = atoi(argv[7]);
11878       if (warm_up_loop_count < 0) {
11879         printf("The warm up loop count error! Cannot be less than 0.\n");
11880-        return kMSStatusLiteParamInvalid;
11881+        return OH_AI_STATUS_LITE_PARAM_INVALID;
11882       }
11883   }
11884   printf("Running warm up loops...");
11885   for (int i = 0; i < warm_up_loop_count; ++i) {
11886-    ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11887-    if (ret != kMSStatusSuccess) {
11888-      MSModelDestroy(&model_handle);
11889-      printf("MSModelPredict failed, ret: %d", ret);
11890+    ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11891+    if (ret != OH_AI_STATUS_SUCCESS) {
11892+      OH_AI_ModelDestroy(&model_handle);
11893+      printf("OH_AI_ModelPredict failed, ret: %d", ret);
11894       return ret;
11895     }
11896   }
11897@@ -244,10 +244,10 @@ int main(int argc, const char **argv) {
11898     printf("\nloop count: %d\n", loop_count);
11899     uint64_t start_time = GetTimeUs();
11900     for (int i = 0; i < loop_count; ++i) {
11901-      ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11902-      if (ret != kMSStatusSuccess) {
11903-        MSModelDestroy(&model_handle);
11904-        printf("MSModelPredict failed, ret: %d", ret);
11905+      ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11906+      if (ret != OH_AI_STATUS_SUCCESS) {
11907+        OH_AI_ModelDestroy(&model_handle);
11908+        printf("OH_AI_ModelPredict failed, ret: %d", ret);
11909         return ret;
11910       }
11911     }
11912@@ -255,23 +255,23 @@ int main(int argc, const char **argv) {
11913     float total_time = (float)(end_time - start_time) / 1000.0f;
11914     printf("total time: %.5fms, per time: %.5fms\n", total_time, total_time / loop_count);
11915   }
11916-  ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11917-  if (ret != kMSStatusSuccess) {
11918-    MSModelDestroy(&model_handle);
11919+  ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11920+  if (ret != OH_AI_STATUS_SUCCESS) {
11921+    OH_AI_ModelDestroy(&model_handle);
11922     return ret;
11923   }
11924   printf("========run success=======\n");
11925   printf("\noutputs: \n");
11926   for (size_t i = 0; i < outputs_handle.handle_num; i++) {
11927-    MSTensorHandle output = outputs_handle.handle_list[i];
11928+    OH_AI_TensorHandle output = outputs_handle.handle_list[i];
11929     PrintTensorHandle(output);
11930   }
11931   if (argc >= 5) {
11932     CalibTensor *calib_tensors;
11933     int calib_num = 0;
11934     ret = ReadCalibData(argv[4], &calib_tensors, &calib_num);
11935-    if (ret != kMSStatusSuccess) {
11936-      MSModelDestroy(&model_handle);
11937+    if (ret != OH_AI_STATUS_SUCCESS) {
11938+      OH_AI_ModelDestroy(&model_handle);
11939       return ret;
11940     }
11941     float cosine_distance_threshold = 0.9999;
11942@@ -279,15 +279,15 @@ int main(int argc, const char **argv) {
11943       cosine_distance_threshold = atof(argv[8]);
11944     }
11945     ret = CompareOutputs(outputs_handle, &calib_tensors, calib_num, cosine_distance_threshold);
11946-    if (ret != kMSStatusSuccess) {
11947-      MSModelDestroy(&model_handle);
11948+    if (ret != OH_AI_STATUS_SUCCESS) {
11949+      OH_AI_ModelDestroy(&model_handle);
11950       return ret;
11951     }
11952     FreeCalibTensors(&calib_tensors, calib_num);
11953   }
11954   printf("========run success=======\n");
11955-  MSModelDestroy(&model_handle);
11956-  return kMSStatusSuccess;
11957+  OH_AI_ModelDestroy(&model_handle);
11958+  return OH_AI_STATUS_SUCCESS;
11959 }
11960 )RAW";
11961
11962@@ -385,7 +385,7 @@ int benchmark() {
11963     return kMSStatusLiteError;
11964   }
11965   MSModelSetWorkspace(model_handle, g_WorkSpace, WORK_SPACE_SIZE);
11966-  ret = MSModelBuild(model_handle, NULL, 0, kMSModelTypeMindIR, NULL);
11967+  ret = OH_AI_ModelBuild(model_handle, NULL, 0, OH_AI_MODELTYPE_MINDIR, NULL);
11968   if (ret != kMSStatusSuccess) {
11969     printf("MSModelBuildFromFile failed, ret : %d.\n", ret);
11970     MSModelDestroy(&model_handle);
11971@@ -424,7 +424,7 @@ int benchmark() {
11972   }
11973
11974   printf("========Infer start=======\n");
11975-  ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11976+  ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL);
11977   if (ret != kMSStatusSuccess) {
11978     MSModelDestroy(&model_handle);
11979     return ret;
11980diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc
11981index 71ca2287..66af9069 100644
11982--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc
11983+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc
11984@@ -48,7 +48,7 @@ typedef struct CalibTensor {
11985   float *data_;
11986 } CalibTensor;
11987 int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensots, int *calib_num);
11988-int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num,
11989+int CompareOutputs(OH_AI_TensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num,
11990                    float cosine_distance_threshold);
11991 void FreeCalibTensors(CalibTensor **calib_tensors, int calib_num);
11992
11993@@ -89,12 +89,12 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe
11994   FILE *file = fopen(calib_data_path, "r");
11995   if (!file) {
11996     printf("Unable open %s", calib_data_path);
11997-    return kMSStatusLiteError;
11998+    return OH_AI_STATUS_LITE_ERROR;
11999   }
12000   CalibTensor *calib_tensors = (CalibTensor *)malloc(kMaxOutput * sizeof(CalibTensor));
12001   if(calib_tensors == NULL) {
12002     printf("Malloc calib tensors failed.");
12003-    return kMSStatusLiteError;
12004+    return OH_AI_STATUS_LITE_ERROR;
12005   }
12006   // read line by line
12007   char line[kMaxTensorSize];
12008@@ -111,7 +111,7 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe
12009       char* tensor_name = (char *)malloc(strlen(p)+1);
12010       if(tensor_name == NULL) {
12011         printf("Malloc tensor name failed.");
12012-        return kMSStatusLiteError;
12013+        return OH_AI_STATUS_LITE_ERROR;
12014       }
12015       (void)strcpy(tensor_name, p);
12016       calib_tensors[*calib_num].tensor_name = tensor_name;
12017@@ -134,7 +134,7 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe
12018       float *data = (float *)malloc(elements * sizeof(float));
12019       if(data == NULL) {
12020         printf("Malloc tensor data failed.");
12021-        return kMSStatusLiteError;
12022+        return OH_AI_STATUS_LITE_ERROR;
12023       }
12024       p = strtok(line, " ");
12025       int k = 0;
12026@@ -152,43 +152,43 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe
12027   }
12028   *calib_tensor_pointers = calib_tensors;
12029   fclose(file);
12030-  return kMSStatusSuccess;
12031+  return OH_AI_STATUS_SUCCESS;
12032 }
12033
12034-int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num,
12035+int CompareOutputs(OH_AI_TensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num,
12036                    float cosine_distance_threshold) {
12037   if (outputs.handle_num != (size_t)calib_num) {
12038     printf("error, outputs and calibs size is mismatch\n");
12039-    return kMSStatusLiteError;
12040+    return OH_AI_STATUS_LITE_ERROR;
12041   }
12042   size_t outputs_num = outputs.handle_num;
12043   bool is_success = true;
12044   for (size_t i = 0; i < outputs_num; ++i) {
12045     MicroTensor *output = (MicroTensor *)outputs.handle_list[i];
12046     if (!output || !output->data) {
12047-      return kMSStatusLiteError;
12048+      return OH_AI_STATUS_LITE_ERROR;
12049     }
12050     CalibTensor *calib = calib_tensors[0];
12051     if (!calib || !calib[i].data_) {
12052-      return kMSStatusLiteError;
12053+      return OH_AI_STATUS_LITE_ERROR;
12054     }
12055     if (strcmp(output->name, calib[i].tensor_name) != 0) {
12056       printf("warning, output tensor name is not equal to calib\n");
12057     }
12058-    size_t elements = (size_t)MSTensorGetElementNum(output);
12059+    size_t elements = (size_t)OH_AI_TensorGetElementNum(output);
12060     if (elements != (size_t)calib[i].elemets_num_) {
12061       printf("error, output elements num is not equal to calib\n");
12062-      return kMSStatusLiteError;
12063+      return OH_AI_STATUS_LITE_ERROR;
12064     }
12065     float cosin = 0.f, dot = 0.f, normx = 0.f, normy = 0.f;
12066     switch (output->type) {
12067-      case kMSDataTypeNumberTypeFloat32: {
12068+      case OH_AI_DATATYPE_NUMBERTYPE_FLOAT32: {
12069         float *float_output = (float *)output->data;
12070         for (size_t j = 0; j < elements; ++j) {
12071           if (isnan(float_output[j]) || isinf(float_output[j]) || isnan(calib[i].data_[j]) ||
12072               isinf(calib[i].data_[j])) {
12073             printf("error, output data is nan or inf\n");
12074-            return kMSStatusLiteError;
12075+            return OH_AI_STATUS_LITE_ERROR;
12076           }
12077           dot += float_output[j] * calib[i].data_[j];
12078           normx += float_output[j] * float_output[j];
12079@@ -196,7 +196,7 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int
12080         }
12081         break;
12082       }
12083-      case kMSDataTypeNumberTypeInt8: {
12084+      case OH_AI_DATATYPE_NUMBERTYPE_INT8: {
12085         int8_t *int_output = (int8_t *)output->data;
12086         for (size_t j = 0; j < elements; ++j) {
12087           dot += (float) (int_output[j] * calib[i].data_[j]);
12088@@ -205,7 +205,7 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int
12089         }
12090         break;
12091       }
12092-      case kMSDataTypeNumberTypeUInt8: {
12093+      case OH_AI_DATATYPE_NUMBERTYPE_UINT8: {
12094         uint8_t *int_output = (uint8_t *)output->data;
12095         for (size_t j = 0; j < elements; ++j) {
12096           dot += (float) (int_output[j] * calib[i].data_[j]);
12097@@ -214,8 +214,8 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int
12098         }
12099         break;
12100       }
12101-      case kMSDataTypeNumberTypeInt32:
12102-      case kMSDataTypeNumberTypeUInt32: {
12103+      case OH_AI_DATATYPE_NUMBERTYPE_INT32:
12104+      case OH_AI_DATATYPE_NUMBERTYPE_UINT32: {
12105         int32_t *int_output = (int32_t *)output->data;
12106         for (size_t j = 0; j < elements; ++j) {
12107           dot += (float) (int_output[j] * calib[i].data_[j]);
12108@@ -238,10 +238,10 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int
12109   }
12110   if (!is_success) {
12111     printf("compare outputs failed.\n");
12112-    return kMSStatusLiteError;
12113+    return OH_AI_STATUS_LITE_ERROR;
12114   }
12115   printf("compare outputs success.\n");
12116-  return kMSStatusSuccess;
12117+  return OH_AI_STATUS_SUCCESS;
12118 }
12119
12120 void FreeCalibTensors(CalibTensor **calib_tensors_pointers, int calib_num) {
12121@@ -328,7 +328,7 @@ const char *calib_source_cortex = R"RAW(/**
12122 int LoadCalibInputs(MSTensorHandleArray *inputs, TensorArray *tensor_array) {
12123   if (inputs->handle_num != tensor_array->tensors_size_) {
12124     printf("error, inputs and calibs size is mismatch.\n");
12125-    return kMSStatusLiteError;
12126+    return OH_AI_STATUS_LITE_ERROR;
12127   }
12128   Tensor *calib_tensors = tensor_array->tensors_;
12129   if (calib_tensors == NULL) {
12130diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc
12131index 79bfc485..f63e6f9e 100644
12132--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc
12133+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc
12134@@ -127,9 +127,9 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
12135     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default")
12136 else()
12137     message(STATUS "build benchmark release version")
12138-    set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \
12139+    set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \
12140     -Wno-deprecated-declarations -Wno-missing-braces ${CMAKE_C_FLAGS}")
12141-    set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \
12142+    set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \
12143     -Wno-deprecated-declarations -Wno-missing-braces -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS}")
12144     string(REPLACE "-g" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
12145     string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
12146@@ -211,9 +211,9 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
12147     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default")
12148 else()
12149     message(STATUS "build net library release version")
12150-    set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \
12151+    set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \
12152     -Wno-deprecated-declarations -Wno-missing-braces ${CMAKE_C_FLAGS}")
12153-    set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \
12154+    set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \
12155     -Wno-deprecated-declarations -Wno-missing-braces -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS}")
12156     string(REPLACE "-g" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
12157     string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
12158@@ -241,11 +241,11 @@ function(create_library)
12159     endforeach()
12160     add_custom_command(TARGET net
12161             POST_BUILD
12162-            COMMAND ar cr ${library_name} *.o
12163+            COMMAND ar cr ${library_name} *.obj
12164             COMMAND ranlib ${library_name}
12165             COMMAND echo "new static library ${library_name} size:"
12166             COMMAND ls -lh ${library_name}
12167-            COMMAND rm -rf tmp && rm -rf *.o
12168+            COMMAND rm -rf tmp && rm -rf *.obj
12169             COMMENT "generate specified static library ${library_name}"
12170             )
12171 endfunction(create_library)
12172diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc
12173index 9a2aeaa7..669cd8c1 100644
12174--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc
12175+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc
12176@@ -131,7 +131,7 @@ int ReadInputsFile(char *path, void **buffers, const int *inputs_size, int input
12177   while ((token = strtok_r(path, delim, &path))) {
12178     if (i >= inputs_num) {
12179       printf("inputs num is error, need: %d\n", inputs_num);
12180-      return kMSStatusLiteParamInvalid;
12181+      return OH_AI_STATUS_LITE_PARAM_INVALID;
12182     }
12183     inputs_path[i] = token;
12184     printf("input %d: %s\n", i, inputs_path[i]);
12185@@ -144,7 +144,7 @@ int ReadInputsFile(char *path, void **buffers, const int *inputs_size, int input
12186     if (size != inputs_size[i] || buffers[i] == NULL) {
12187       printf("size mismatch, %s, input: %d, needed: %d\n", inputs_path[i], size, inputs_size[i]);
12188       free(buffers[i]);
12189-      return kMSStatusLiteError;
12190+      return OH_AI_STATUS_LITE_ERROR;
12191     }
12192   }
12193   return 0;
12194diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc
12195index 856de855..d662e3a8 100644
12196--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc
12197+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc
12198@@ -73,24 +73,24 @@ const char context_source_cortex[] = R"RAW(
12199 #include <stdlib.h>
12200 #include <string.h>
12201
12202-MSContextHandle MSContextCreate() {
12203+OH_AI_ContextHandle OH_AI_ContextCreate() {
12204   return NULL;
12205 }
12206
12207-void MSContextDestroy(MSContextHandle *context) {
12208+void OH_AI_ContextDestroy(OH_AI_ContextHandle *context) {
12209 }
12210
12211-void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) {
12212+void OH_AI_ContextSetThreadNum(OH_AI_ContextHandle context, int32_t thread_num) {
12213 }
12214
12215-int32_t MSContextGetThreadNum(const MSContextHandle context) {
12216+int32_t OH_AI_ContextGetThreadNum(const OH_AI_ContextHandle context) {
12217   return 1;
12218 }
12219
12220-void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) {
12221+void OH_AI_ContextSetThreadAffinityMode(OH_AI_ContextHandle context, int mode) {
12222 }
12223
12224-int MSContextGetThreadAffinityMode(const MSContextHandle context) {
12225+int OH_AI_ContextGetThreadAffinityMode(const OH_AI_ContextHandle context) {
12226   return 0;
12227 }
12228 )RAW";
12229@@ -116,7 +116,7 @@ const char context_source_no_parallel[] = R"RAW(
12230 #include <stdlib.h>
12231 #include <string.h>
12232
12233-MSContextHandle MSContextCreate() {
12234+OH_AI_ContextHandle OH_AI_ContextCreate() {
12235   MicroContext *micro_context = (MicroContext *)malloc(sizeof(MicroContext));
12236   if (micro_context == NULL) {
12237     return NULL;
12238@@ -129,7 +129,7 @@ MSContextHandle MSContextCreate() {
12239   return micro_context;
12240 }
12241
12242-void MSContextDestroy(MSContextHandle *context) {
12243+void OH_AI_ContextDestroy(OH_AI_ContextHandle *context) {
12244   MicroContext *micro_context = (MicroContext *)(*context);
12245   if (micro_context) {
12246     free(micro_context);
12247@@ -137,17 +137,17 @@ void MSContextDestroy(MSContextHandle *context) {
12248   }
12249 }
12250
12251-void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) {
12252+void OH_AI_ContextSetThreadNum(OH_AI_ContextHandle context, int32_t thread_num) {
12253 }
12254
12255-int32_t MSContextGetThreadNum(const MSContextHandle context) {
12256+int32_t OH_AI_ContextGetThreadNum(const OH_AI_ContextHandle context) {
12257   return 1;
12258 }
12259
12260-void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) {
12261+void OH_AI_ContextSetThreadAffinityMode(OH_AI_ContextHandle context, int mode) {
12262 }
12263
12264-int MSContextGetThreadAffinityMode(const MSContextHandle context) {
12265+int OH_AI_ContextGetThreadAffinityMode(const OH_AI_ContextHandle context) {
12266   return 0;
12267 }
12268 )RAW";
12269@@ -176,7 +176,7 @@ const char context_source[] = R"RAW(
12270
12271 #define MAX_THREAD_NUM 4
12272
12273-MSContextHandle MSContextCreate() {
12274+OH_AI_ContextHandle OH_AI_ContextCreate() {
12275   MicroContext *micro_context = (MicroContext *)malloc(sizeof(MicroContext));
12276   if (micro_context == NULL) {
12277     return NULL;
12278@@ -189,7 +189,7 @@ MSContextHandle MSContextCreate() {
12279   return micro_context;
12280 }
12281
12282-void MSContextDestroy(MSContextHandle *context) {
12283+void OH_AI_ContextDestroy(OH_AI_ContextHandle *context) {
12284   MicroContext *micro_context = (MicroContext *)(*context);
12285   if (micro_context) {
12286     if (micro_context->affinity_core_list_) {
12287@@ -201,7 +201,7 @@ void MSContextDestroy(MSContextHandle *context) {
12288   }
12289 }
12290
12291-void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) {
12292+void OH_AI_ContextSetThreadNum(OH_AI_ContextHandle context, int32_t thread_num) {
12293   MicroContext *micro_context = (MicroContext *)context;
12294   if (micro_context) {
12295     int core_num = GetCpuCoreNum();
12296@@ -214,7 +214,7 @@ void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) {
12297   }
12298 }
12299
12300-int32_t MSContextGetThreadNum(const MSContextHandle context) {
12301+int32_t OH_AI_ContextGetThreadNum(const OH_AI_ContextHandle context) {
12302   MicroContext *micro_context = (MicroContext *)context;
12303   if (micro_context) {
12304     return micro_context->thread_num_;
12305@@ -222,7 +222,7 @@ int32_t MSContextGetThreadNum(const MSContextHandle context) {
12306   return 0;
12307 }
12308
12309-void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) {
12310+void OH_AI_ContextSetThreadAffinityMode(OH_AI_ContextHandle context, int mode) {
12311   MicroContext *micro_context = (MicroContext *)context;
12312   if (micro_context) {
12313     if (mode >= 0 && mode <= 2) {
12314@@ -233,7 +233,7 @@ void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) {
12315   }
12316 }
12317
12318-int MSContextGetThreadAffinityMode(const MSContextHandle context) {
12319+int OH_AI_ContextGetThreadAffinityMode(const OH_AI_ContextHandle context) {
12320   MicroContext *micro_context = (MicroContext *)context;
12321   if (micro_context) {
12322     return micro_context->affinity_mode;
12323diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc
12324index 44273071..5cbe4507 100644
12325--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc
12326+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc
12327@@ -18,25 +18,25 @@
12328
12329 namespace mindspore::lite::micro {
12330 const char model_runtime_other_source[] = R"RAW(
12331-MSTensorHandleArray MSModelGetInputs(const MSModelHandle model) {
12332+OH_AI_TensorHandleArray OH_AI_ModelGetInputs(const OH_AI_ModelHandle model) {
12333   MicroModel *micro_model = (MicroModel *)model;
12334   if (micro_model == NULL) {
12335-    MSTensorHandleArray tmp = {0, NULL};
12336+    OH_AI_TensorHandleArray tmp = {0, NULL};
12337     return tmp;
12338   }
12339   return micro_model->inputs;
12340 }
12341
12342-MSTensorHandleArray MSModelGetOutputs(const MSModelHandle model) {
12343+OH_AI_TensorHandleArray OH_AI_ModelGetOutputs(const OH_AI_ModelHandle model) {
12344   MicroModel *micro_model = (MicroModel *)model;
12345   if (micro_model == NULL) {
12346-    MSTensorHandleArray tmp = {0, NULL};
12347+    OH_AI_TensorHandleArray tmp = {0, NULL};
12348     return tmp;
12349   }
12350   return micro_model->outputs;
12351 }
12352
12353-MSTensorHandle MSModelGetInputByTensorName(const MSModelHandle model, const char *tensor_name) {
12354+OH_AI_TensorHandle OH_AI_ModelGetInputByTensorName(const OH_AI_ModelHandle model, const char *tensor_name) {
12355   MicroModel *micro_model = (MicroModel *)model;
12356   if (micro_model == NULL || micro_model->inputs.handle_list == NULL) {
12357     return NULL;
12358@@ -53,7 +53,7 @@ MSTensorHandle MSModelGetInputByTensorName(const MSModelHandle model, const char
12359   return NULL;
12360 }
12361
12362-MSTensorHandle MSModelGetOutputByTensorName(const MSModelHandle model, const char *tensor_name) {
12363+OH_AI_TensorHandle OH_AI_ModelGetOutputByTensorName(const OH_AI_ModelHandle model, const char *tensor_name) {
12364   MicroModel *micro_model = (MicroModel *)model;
12365   if (micro_model == NULL || micro_model->outputs.handle_list == NULL) {
12366     return NULL;
12367@@ -70,9 +70,16 @@ MSTensorHandle MSModelGetOutputByTensorName(const MSModelHandle model, const cha
12368   return NULL;
12369 }
12370
12371-MSStatus MSModelResize(MSModelHandle model, const MSTensorHandleArray inputs, MSShapeInfo *shape_infos,
12372+OH_AI_Status OH_AI_ModelResize(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_ShapeInfo *shape_infos,
12373                        size_t shape_info_num) {
12374-  return kMSStatusLiteNotSupport;
12375+  MicroModel *micro_model = (MicroModel *)model;
12376+  if (micro_model == NULL) {
12377+    return OH_AI_STATUS_LITE_NULLPTR;
12378+  }
12379+  if (micro_model->resize == NULL) {
12380+    return OH_AI_STATUS_LITE_NULLPTR;
12381+  }
12382+  return micro_model->resize(model, inputs, shape_infos, shape_info_num);
12383 }
12384
12385 )RAW";
12386diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc
12387index b125b31d..e4581829 100644
12388--- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc
12389+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc
12390@@ -46,8 +46,8 @@ const char tensor_header[] = R"RAW(
12391 #endif
12392
12393 typedef struct {
12394-  enum MSDataType type;
12395-  enum MSFormat format;
12396+  enum OH_AI_DataType type;
12397+  enum OH_AI_Format format;
12398   char *name;
12399   int ndim;
12400   int64_t *shape;
12401@@ -76,7 +76,7 @@ enum TypeTransMode {
12402   TypeTransMode_MAX = TypeTransMode_UNSUPPORT
12403 };
12404
12405-void *TransformInput(MSTensorHandle tensor, int expect_type, bool *type_changed);
12406+void *TransformInput(OH_AI_TensorHandle tensor, int expect_type, bool *type_changed);
12407
12408 #ifdef ENABLE_FP16
12409 void Fp32CastToFp16(const float *input, float16_t *output, int number);
12410@@ -109,37 +109,37 @@ const char tensor_source[] = R"RAW(
12411 #include "string.h"
12412 #include "tensor.h"
12413
12414-size_t DataTypeSize(const MSDataType type) {
12415+size_t DataTypeSize(const OH_AI_DataType type) {
12416   switch (type) {
12417-    case kMSDataTypeNumberTypeFloat64:
12418+    case OH_AI_DATATYPE_NUMBERTYPE_FLOAT64:
12419       return sizeof(double);
12420-    case kMSDataTypeNumberTypeFloat32:
12421+    case OH_AI_DATATYPE_NUMBERTYPE_FLOAT32:
12422       return sizeof(float);
12423-    case kMSDataTypeNumberTypeInt8:
12424+    case OH_AI_DATATYPE_NUMBERTYPE_INT8:
12425       return sizeof(int8_t);
12426-    case kMSDataTypeNumberTypeUInt8:
12427+    case OH_AI_DATATYPE_NUMBERTYPE_UINT8:
12428       return sizeof(uint8_t);
12429-    case kMSDataTypeNumberTypeFloat16:
12430-    case kMSDataTypeNumberTypeInt16:
12431+    case OH_AI_DATATYPE_NUMBERTYPE_FLOAT16:
12432+    case OH_AI_DATATYPE_NUMBERTYPE_INT16:
12433       return sizeof(int16_t);
12434-    case kMSDataTypeNumberTypeInt32:
12435+    case OH_AI_DATATYPE_NUMBERTYPE_INT32:
12436       return sizeof(int32_t);
12437-    case kMSDataTypeNumberTypeInt64:
12438+    case OH_AI_DATATYPE_NUMBERTYPE_INT64:
12439       return sizeof(int64_t);
12440-    case kMSDataTypeNumberTypeUInt16:
12441+    case OH_AI_DATATYPE_NUMBERTYPE_UINT16:
12442       return sizeof(uint16_t);
12443-    case kMSDataTypeNumberTypeUInt32:
12444+    case OH_AI_DATATYPE_NUMBERTYPE_UINT32:
12445       return sizeof(uint32_t);
12446-    case kMSDataTypeNumberTypeUInt64:
12447+    case OH_AI_DATATYPE_NUMBERTYPE_UINT64:
12448       return sizeof(uint64_t);
12449-    case kMSDataTypeObjectTypeString:
12450+    case OH_AI_DATATYPE_OBJECTTYPE_STRING:
12451       return sizeof(char);
12452     default:
12453       return 0;
12454   }
12455 }
12456
12457-MSTensorHandle MSTensorCreate(const char *name, MSDataType type, const int64_t *shape, size_t shape_num,
12458+OH_AI_TensorHandle OH_AI_TensorCreate(const char *name, OH_AI_DataType type, const int64_t *shape, size_t shape_num,
12459                               const void *data, size_t data_len) {
12460   size_t data_type_len = DataTypeSize(type);
12461   size_t acc_sum = 1;
12462@@ -160,16 +160,16 @@ MSTensorHandle MSTensorCreate(const char *name, MSDataType type, const int64_t *
12463   memcpy(micro_tensor->data, data, data_len);
12464   micro_tensor->shape = malloc(shape_num * sizeof(int64_t));
12465   memcpy(micro_tensor->shape, shape, shape_num * sizeof(int64_t));
12466-  micro_tensor->format = kMSFormatNHWC;
12467+  micro_tensor->format = OH_AI_FORMAT_NHWC;
12468   return micro_tensor;
12469 }
12470
12471-void MSTensorDestroy(MSTensorHandle *tensor) {
12472+void OH_AI_TensorDestroy(OH_AI_TensorHandle *tensor) {
12473   MicroTensor* micro_tensor = (MicroTensor*)(*tensor);
12474   free(micro_tensor);
12475 }
12476
12477-void MSTensorSetName(MSTensorHandle tensor, const char *name) {
12478+void OH_AI_TensorSetName(OH_AI_TensorHandle tensor, const char *name) {
12479   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12480   if(micro_tensor->name != NULL) {
12481     free(micro_tensor->name);
12482@@ -179,10 +179,10 @@ void MSTensorSetName(MSTensorHandle tensor, const char *name) {
12483   memcpy(micro_tensor->name, name, len + 1);
12484 }
12485
12486-MSTensorHandle MSTensorClone(MSTensorHandle tensor) {
12487+OH_AI_TensorHandle OH_AI_TensorClone(OH_AI_TensorHandle tensor) {
12488   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12489   MicroTensor *clone_tensor = malloc( sizeof(MicroTensor));
12490-  size_t tensor_data_size = MSTensorGetDataSize(micro_tensor);
12491+  size_t tensor_data_size = OH_AI_TensorGetDataSize(micro_tensor);
12492   clone_tensor->data = malloc(tensor_data_size);
12493   clone_tensor->owned = true;
12494   memcpy(clone_tensor->data,micro_tensor->data,tensor_data_size);
12495@@ -195,26 +195,26 @@ MSTensorHandle MSTensorClone(MSTensorHandle tensor) {
12496   clone_tensor->shape = clone_shape;
12497   char* clone_name = malloc(strlen(micro_tensor->name));
12498   strcpy(clone_name,micro_tensor->name);
12499-  clone_tensor->format = kMSFormatNHWC;
12500+  clone_tensor->format = OH_AI_FORMAT_NHWC;
12501   return clone_tensor;
12502 }
12503
12504-const char *MSTensorGetName(const MSTensorHandle tensor) {
12505+const char *OH_AI_TensorGetName(const OH_AI_TensorHandle tensor) {
12506   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12507   return micro_tensor->name;
12508 }
12509
12510-void MSTensorSetDataType(MSTensorHandle tensor, MSDataType type) {
12511+void OH_AI_TensorSetDataType(OH_AI_TensorHandle tensor, OH_AI_DataType type) {
12512   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12513   micro_tensor->type = type;
12514 }
12515
12516-MSDataType MSTensorGetDataType(const MSTensorHandle tensor) {
12517+OH_AI_DataType OH_AI_TensorGetDataType(const OH_AI_TensorHandle tensor) {
12518   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12519   return micro_tensor->type;
12520 }
12521
12522-void MSTensorSetShape(MSTensorHandle tensor, const int64_t *shape, size_t shape_num) {
12523+void OH_AI_TensorSetShape(OH_AI_TensorHandle tensor, const int64_t *shape, size_t shape_num) {
12524   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12525   if(micro_tensor->shape != NULL) {
12526     free(micro_tensor->shape);
12527@@ -224,23 +224,23 @@ void MSTensorSetShape(MSTensorHandle tensor, const int64_t *shape, size_t shape_
12528   memcpy(micro_tensor->shape, shape, shape_num * sizeof(int64_t));
12529 }
12530
12531-const int64_t *MSTensorGetShape(const MSTensorHandle tensor, size_t *shape_num) {
12532+const int64_t *OH_AI_TensorGetShape(const OH_AI_TensorHandle tensor, size_t *shape_num) {
12533   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12534   *shape_num =  micro_tensor->ndim;
12535   return micro_tensor->shape;
12536 }
12537
12538-void MSTensorSetFormat(MSTensorHandle tensor, MSFormat format) {
12539+void OH_AI_TensorSetFormat(OH_AI_TensorHandle tensor, OH_AI_Format format) {
12540   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12541   micro_tensor->format = format;
12542 }
12543
12544-MSFormat MSTensorGetFormat(const MSTensorHandle tensor) {
12545+OH_AI_Format OH_AI_TensorGetFormat(const OH_AI_TensorHandle tensor) {
12546   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12547   return micro_tensor->format;
12548 }
12549
12550-void MSTensorSetData(MSTensorHandle tensor, void *data) {
12551+void OH_AI_TensorSetData(OH_AI_TensorHandle tensor, void *data) {
12552   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12553   if (micro_tensor->data == data) {
12554     return;
12555@@ -254,23 +254,23 @@ void MSTensorSetData(MSTensorHandle tensor, void *data) {
12556   micro_tensor->data = data;
12557 }
12558
12559-const void *MSTensorGetData(const MSTensorHandle tensor) {
12560+const void *OH_AI_TensorGetData(const OH_AI_TensorHandle tensor) {
12561   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12562   return micro_tensor->data;
12563 }
12564
12565-void *MSTensorGetMutableData(const MSTensorHandle tensor) {
12566+void *OH_AI_TensorGetMutableData(const OH_AI_TensorHandle tensor) {
12567   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12568   if(micro_tensor->data) {
12569     return micro_tensor->data;
12570   }
12571-  void* data = malloc(MSTensorGetDataSize(tensor));
12572+  void* data = malloc(OH_AI_TensorGetDataSize(tensor));
12573   micro_tensor->owned = true;
12574   micro_tensor->data = data;
12575   return data;
12576 }
12577
12578-int64_t MSTensorGetElementNum(const MSTensorHandle tensor) {
12579+int64_t OH_AI_TensorGetElementNum(const OH_AI_TensorHandle tensor) {
12580   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12581   int64_t acc_sum = 1;
12582   for(int i=0;i< micro_tensor->ndim;i++) {
12583@@ -279,10 +279,10 @@ int64_t MSTensorGetElementNum(const MSTensorHandle tensor) {
12584   return acc_sum;
12585 }
12586
12587-size_t MSTensorGetDataSize(const MSTensorHandle tensor) {
12588+size_t OH_AI_TensorGetDataSize(const OH_AI_TensorHandle tensor) {
12589   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12590   size_t data_type_size = DataTypeSize(micro_tensor->type);
12591-  int64_t elements = MSTensorGetElementNum(tensor);
12592+  int64_t elements = OH_AI_TensorGetElementNum(tensor);
12593   return data_type_size * elements;
12594 }
12595
12596@@ -300,16 +300,16 @@ void Fp16CastToFp32(const float16_t *input, float *output, int number) {
12597 }
12598 #endif
12599
12600-void *TransformInput(MSTensorHandle tensor, int expect_type, bool *type_changed) {
12601+void *TransformInput(OH_AI_TensorHandle tensor, int expect_type, bool *type_changed) {
12602   MicroTensor* micro_tensor = (MicroTensor*)(tensor);
12603   int cur_type = micro_tensor->type;
12604   if (cur_type == expect_type) {
12605     return micro_tensor->data;
12606   }
12607   int type_trans_mode = TypeTransMode_MAX;
12608-  if (expect_type == kMSDataTypeNumberTypeFloat16 && cur_type == kMSDataTypeNumberTypeFloat32) {
12609+  if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT16 && cur_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32) {
12610     type_trans_mode = TypeTransMode_FP32_TO_FP16;
12611-  } else if (expect_type == kMSDataTypeNumberTypeFloat32 && cur_type == kMSDataTypeNumberTypeFloat16) {
12612+  } else if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32 && cur_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT16) {
12613     type_trans_mode = TypeTransMode_FP16_TO_FP32;
12614   }
12615   if (type_trans_mode == TypeTransMode_UNSUPPORT) {
12616diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc
12617index ac958750..6a131b52 100644
12618--- a/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc
12619+++ b/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc
12620@@ -61,6 +61,8 @@ void CodeWeightFileHeader(std::ofstream &ofs, const std::unique_ptr<CoderContext
12621       << "#include <string.h>\n"
12622       << "extern unsigned char *" << ctx->buffer_name() << ";\n"
12623       << "extern uint8_t *" << ctx->weight_name() << ";\n"
12624+      << "extern int *" << kShapePrefixName << ";\n"
12625+      << "extern int *" << kOffsetPrefixName << ";\n"
12626       << "enum STATUS {\n"
12627          "  RET_OK = 0,\n"
12628          "  RET_ERROR = 1,\n"
12629diff --git a/mindspore/lite/tools/converter/micro/coder/generator/generator.cc b/mindspore/lite/tools/converter/micro/coder/generator/generator.cc
12630index dd66c333..23009e17 100644
12631--- a/mindspore/lite/tools/converter/micro/coder/generator/generator.cc
12632+++ b/mindspore/lite/tools/converter/micro/coder/generator/generator.cc
12633@@ -43,20 +43,28 @@ const char micro_model_define_source[] = R"RAW(
12634 typedef struct {
12635   void *runtime_buffer;
12636   bool train_mode;  // true: train mode, false: eval mode
12637-  MSTensorHandleArray inputs;
12638-  MSTensorHandleArray outputs;
12639+  OH_AI_TensorHandleArray inputs;
12640+  OH_AI_TensorHandleArray outputs;
12641   ModelBuild build;
12642+  ModelResize resize;
12643   ModelSetWorkspace set_work_space;
12644   ModelCalcWorkspaceSize calc_work_space;
12645   FreeResource free_resource;
12646 )RAW";
12647
12648 const char set_workspace_state[] = R"RAW(
12649-typedef void (*ModelSetWorkspace)(MSModelHandle model, void *workspace, size_t workspace_size);
12650+typedef void (*ModelSetWorkspace)(OH_AI_ModelHandle model, void *workspace, size_t workspace_size);
12651 )RAW";
12652
12653 const char calc_workspace_state[] = R"RAW(
12654-typedef size_t (*ModelCalcWorkspaceSize)(MSModelHandle model);
12655+typedef size_t (*ModelCalcWorkspaceSize)(OH_AI_ModelHandle model);
12656+)RAW";
12657+
12658+const char model_resize[] = R"RAW(
12659+typedef OH_AI_Status (*ModelResize)(OH_AI_ModelHandle model,
12660+                                const OH_AI_TensorHandleArray inputs,
12661+                                OH_AI_ShapeInfo *shape_infos,
12662+                                size_t shape_info_num);
12663 )RAW";
12664
12665 int WriteContentToFile(const std::string &file, const std::string &content) {
12666@@ -311,6 +319,7 @@ int Generator::CodeCommonModelFile() {
12667   CodeFreeResourceState(hofs);
12668   hofs << set_workspace_state;
12669   hofs << calc_workspace_state;
12670+  hofs << model_resize;
12671   hofs << micro_model_define_source;
12672   if (config_->code_mode() == CodeMode::Inference) {
12673     hofs << "  ModelPredict predict;\n";
12674@@ -321,7 +330,7 @@ int Generator::CodeCommonModelFile() {
12675   }
12676   hofs << "} MicroModel;\n";
12677
12678-  hofs << "void MSTensorHandleArrayDestroy(MSTensorHandleArray inputs);\n";
12679+  hofs << "void MSTensorHandleArrayDestroy(OH_AI_TensorHandleArray inputs);\n";
12680   hofs << "#endif // MINDSPORE_LITE_MICRO_LIBRARY_SOURCE_MODEL_H_\n\n";
12681
12682   // model source file
12683@@ -340,7 +349,7 @@ int Generator::CodeCommonModelFile() {
12684   if (config_->support_parallel()) {
12685     cofs << "#include \"" << kThreadWrapper << "\"\n";
12686   }
12687-  if (config_->target() != kCortex_M) {
12688+  if (config_->target() != kCortex_M && !config_->dynamic_shape()) {
12689     cofs << "#include \"src/allocator.h\"\n";
12690   }
12691   CodeMSModelCalcWorkspaceSize(cofs, ctx_, *config_);
12692@@ -369,7 +378,7 @@ int Generator::CodeModelHandleHFile() {
12693          "#define MINDSPORE_LITE_MICRO_LIBRARY_INCLUDE_MODEL_HANDLE_H_\n\n"
12694       << "#include \"c_api/model_c.h\"\n\n";
12695   for (int i = 0; i <= ctx_->GetCurModelIndex(); ++i) {
12696-    ofs << "extern MSModelHandle model" << std::to_string(i) << "; // " << ctx_->model_name() << "\n";
12697+    ofs << "extern OH_AI_ModelHandle model" << std::to_string(i) << "; // " << ctx_->model_name() << "\n";
12698   }
12699   ofs << "\n#endif  // MINDSPORE_LITE_MICRO_LIBRARY_INCLUDE_MODEL_HANDLE_H_\n";
12700   return RET_OK;
12701@@ -386,7 +395,7 @@ int Generator::CodeMSModelImplement() {
12702   ofs << "#include \"c_api/model_c.h\"\n";
12703   ofs << "#include \"src/model.h\"\n";
12704   ofs << "#include \"src/model" << ctx_->GetCurModelIndex() << "/" << net_inc_hfile_ << "\"\n";
12705-  if (config_->target() != kCortex_M) {
12706+  if (config_->target() != kCortex_M && !config_->dynamic_shape()) {
12707     ofs << "#include \"src/allocator.h\"\n";
12708   }
12709   if (config_->support_parallel()) {
12710@@ -399,33 +408,37 @@ int Generator::CodeMSModelImplement() {
12711     ofs << "#define GRAPH_OUTPUTS_SIZE " << ctx_->graph_outputs().size() << "\n";
12712     ofs << "#define WEIGHT_BUF_SIZE " << ctx_->weight_buffer_size() << "\n";
12713   }
12714-  ofs << "MSStatus MSModelBuild" << ctx_->GetCurModelIndex() << "(MSModelHandle model, const void *model_data,\n"
12715-      << "                       size_t data_size, const MSContextHandle model_context);\n";
12716+  ofs << "OH_AI_Status OH_AI_ModelBuild" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model, const void *model_data,\n"
12717+      << "                       size_t data_size, const OH_AI_ContextHandle model_context);\n";
12718+  ofs << "OH_AI_Status OH_AI_ModelResize" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model, \n"
12719+      << "                       const OH_AI_TensorHandleArray inputs, OH_AI_ShapeInfo *shape_infos, size_t shape_info_num);\n";
12720   if (config_->code_mode() == CodeMode::Inference) {
12721-    ofs << "MSStatus MSModelPredict" << ctx_->GetCurModelIndex()
12722-        << "(MSModelHandle model, const MSTensorHandleArray inputs,\n"
12723-        << "                         MSTensorHandleArray *output,\n"
12724-        << "                         const MSKernelCallBackC before,\n"
12725-        << "                         const MSKernelCallBackC after);\n";
12726+    ofs << "OH_AI_Status OH_AI_ModelPredict" << ctx_->GetCurModelIndex()
12727+        << "(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs,\n"
12728+        << "                         OH_AI_TensorHandleArray *output,\n"
12729+        << "                         const OH_AI_KernelCallBack before,\n"
12730+        << "                         const OH_AI_KernelCallBack after);\n";
12731   } else {
12732-    ofs << "MSStatus MSModelRunStep" << ctx_->GetCurModelIndex()
12733-        << "(MSModelHandle model,\n"
12734-           "                       const MSKernelCallBackC before,\n"
12735-           "                       const MSKernelCallBackC after);\n";
12736-    ofs << "MSStatus MSModelSetTrainMode" << ctx_->GetCurModelIndex() << "(MSModelHandle model, bool train);\n";
12737-    ofs << "MSStatus MSModelExportWeight" << ctx_->GetCurModelIndex()
12738-        << "(MSModelHandle model, const char *export_path);\n";
12739-  }
12740+    ofs << "OH_AI_Status MSModelRunStep" << ctx_->GetCurModelIndex()
12741+        << "(OH_AI_ModelHandle model,\n"
12742+           "                       const OH_AI_KernelCallBack before,\n"
12743+           "                       const OH_AI_KernelCallBack after);\n";
12744+    ofs << "OH_AI_Status MSModelSetTrainMode" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model, bool train);\n";
12745+    ofs << "OH_AI_Status MSModelExportWeight" << ctx_->GetCurModelIndex()
12746+        << "(OH_AI_ModelHandle model, const char *export_path);\n";
12747+  }
12748+  ofs << "void Reset" << ctx_->GetCurModelIndex() << "();\n";
12749   ofs << "void MSModelSetWorkspace" << ctx_->GetCurModelIndex()
12750-      << "(MSModelHandle model, void *workspace, size_t workspace_size);\n";
12751-  ofs << "size_t MSModelCalcWorkspaceSize" << ctx_->GetCurModelIndex() << "(MSModelHandle model);\n";
12752+      << "(OH_AI_ModelHandle model, void *workspace, size_t workspace_size);\n";
12753+  ofs << "size_t MSModelCalcWorkspaceSize" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model);\n";
12754   ofs << "static MicroModel gModel" << ctx_->GetCurModelIndex() << " = {.runtime_buffer = NULL,\n"
12755       << "                             .train_mode = false,\n"
12756       << "                             .inputs = {" << ctx_->graph_inputs().size() << ", NULL},\n"
12757       << "                             .outputs = {" << ctx_->graph_outputs().size() << ", NULL},\n"
12758-      << "                             .build = MSModelBuild" << ctx_->GetCurModelIndex() << ",\n";
12759+      << "                             .build = OH_AI_ModelBuild" << ctx_->GetCurModelIndex() << ",\n"
12760+      << "                             .resize = OH_AI_ModelResize" << ctx_->GetCurModelIndex() << ",\n";
12761   if (config_->code_mode() == CodeMode::Inference) {
12762-    ofs << "                             .predict = MSModelPredict" << ctx_->GetCurModelIndex() << ",\n";
12763+    ofs << "                             .predict = OH_AI_ModelPredict" << ctx_->GetCurModelIndex() << ",\n";
12764   } else {
12765     ofs << "                             .run_step = MSModelRunStep" << ctx_->GetCurModelIndex() << ",\n"
12766         << "                             .set_train_mode = MSModelSetTrainMode" << ctx_->GetCurModelIndex() << ",\n"
12767@@ -439,11 +452,16 @@ int Generator::CodeMSModelImplement() {
12768     ofs << "                             .set_work_space = NULL,\n"
12769         << "                             .calc_work_space = NULL,\n";
12770   }
12771-  ofs << "                             .free_resource = FreeResource" << ctx_->GetCurModelIndex() << "};\n";
12772-  ofs << "MSModelHandle model" << ctx_->GetCurModelIndex() << " = &gModel" << ctx_->GetCurModelIndex() << ";\n\n";
12773-
12774+  ofs << "                             .free_resource = Reset" << ctx_->GetCurModelIndex() << "};\n";
12775+  ofs << "OH_AI_ModelHandle model" << ctx_->GetCurModelIndex() << " = &gModel" << ctx_->GetCurModelIndex() << ";\n\n";
12776+  auto &dynamic_symbols = config_->dynamic_symbols();
12777+  for (size_t i = 0; i < dynamic_symbols.size(); ++i) {
12778+    ofs << "static int store" << ctx_->GetCurModelIndex() << "_" << i << " = -1;\n";
12779+  }
12780+  CodeResetImplement(ofs, ctx_, *config_);
12781   CodeMSModelCreate(ofs, ctx_, *config_);
12782   CodeMSModelBuild(ofs, ctx_->GetCurModelIndex(), weight_size_, *config_);
12783+  CodeMSModelResize(ofs, ctx_, *config_);
12784   CodeCopyOutputsImplement(ofs, ctx_);
12785   if (config_->target() == kCortex_M) {
12786     CodeCortexCalcWorkspaceSize(ofs, ctx_);
12787@@ -483,6 +501,8 @@ int Generator::CodeWeightFile() {
12788   if (config_->target() != kCortex_M) {
12789     cofs << "unsigned char *" << ctx_->buffer_name() << " = 0; \n";
12790     cofs << "unsigned char *" << ctx_->weight_name() << " = 0; \n";
12791+    cofs << "int *" << kShapePrefixName << " = 0; \n";
12792+    cofs << "int *" << kOffsetPrefixName << " = 0; \n";
12793     std::string net_file = model_dir_ + "net" + std::to_string(ctx_->GetCurModelIndex()) + ".bin";
12794     SaveDataToNet(ctx_, net_file, config_->keep_original_weight(), &weight_size_);
12795   } else {
12796@@ -598,8 +618,10 @@ int Generator::CreateCommonFiles() {
12797   MS_CHECK_RET_CODE(CodeStaticContent(), "code static content failed.");
12798   MS_CHECK_RET_CODE(CodeModelHandleHFile(), "code model_handle h file failed.");
12799   MS_CHECK_RET_CODE(CodeCommonModelFile(), "code common model file failed.");
12800+  if (!config_->dynamic_shape()) {
12801+    MS_CHECK_RET_CODE(CodeAllocatorFile(), "code allocator file failed.");
12802+  }
12803   MS_CHECK_RET_CODE(CodeRegKernelHFile(), "code registered kernel header file failed.");
12804-  MS_CHECK_RET_CODE(CodeAllocatorFile(), "code allocator file failed.");
12805   MS_CHECK_RET_CODE(CodeSourceCMakeFile(), "code net cmake file failed.");
12806   return RET_OK;
12807 }
12808diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.cc
12809new file mode 100644
12810index 00000000..108ba227
12811--- /dev/null
12812+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.cc
12813@@ -0,0 +1,116 @@
12814+/**
12815+ * Copyright 2023 Huawei Technologies Co., Ltd
12816+ *
12817+ * Licensed under the Apache License, Version 2.0 (the "License");
12818+ * you may not use this file except in compliance with the License.
12819+ * You may obtain a copy of the License at
12820+ *
12821+ * http://www.apache.org/licenses/LICENSE-2.0
12822+ *
12823+ * Unless required by applicable law or agreed to in writing, software
12824+ * distributed under the License is distributed on an "AS IS" BASIS,
12825+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12826+ * See the License for the specific language governing permissions and
12827+ * limitations under the License.
12828+ */
12829+
12830+#include "coder/opcoders/base/reshape_dynamic_base_coder.h"
12831+#include <string>
12832+#include "coder/opcoders/serializers/serializer.h"
12833+#include "include/errorcode.h"
12834+#include "tools/common/string_util.h"
12835+#include "coder/utils/coder_utils.h"
12836+
12837+using mindspore::schema::PrimitiveType_ExpandDims;
12838+using mindspore::schema::PrimitiveType_Flatten;
12839+using mindspore::schema::PrimitiveType_FlattenGrad;
12840+using mindspore::schema::PrimitiveType_Reshape;
12841+using mindspore::schema::PrimitiveType_Squeeze;
12842+using mindspore::schema::PrimitiveType_Unsqueeze;
12843+
12844+namespace mindspore::lite::micro {
12845+int ReshapeDynamicBaseCoder::Prepare(CoderContext *const context) {
12846+  if (input_tensors_.size() == C2NUM) {
12847+    MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT,
12848+                      "Currently, only support the first input of reshape is non-const when shape is dynamical.");
12849+
12850+    MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32 ||
12851+                        input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt,
12852+                      RET_ERROR, "The data-type of Reshape's second input must be int.");
12853+  }
12854+  return RET_OK;
12855+}
12856+
12857+int ReshapeDynamicBaseCoder::DoCode(CoderContext *const context) {
12858+  Serializer coder;
12859+
12860+  int data_item_size = static_cast<int>(lite::DataTypeSize(input_tensor_->data_type()));
12861+  auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_);
12862+  int64_t const_part = 1;
12863+  std::string non_const_part;
12864+  for (const auto &item : in_shape) {
12865+    if (IsNumber(item)) {
12866+      const_part *= std::stoi(item);
12867+    } else {
12868+      if (!non_const_part.empty()) {
12869+        non_const_part += " * ";
12870+      }
12871+      non_const_part += item;
12872+    }
12873+  }
12874+  std::string size = std::to_string(const_part * data_item_size) + " * " + non_const_part;
12875+  std::string input_data = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
12876+  MS_CHECK_TRUE_MSG(!input_data.empty(), RET_ERROR, "pointer is not allocated by the allocator");
12877+  std::string output_data = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
12878+  MS_CHECK_TRUE_MSG(!output_data.empty(), RET_ERROR, "pointer is not allocated by the allocator");
12879+  coder.CodeFunction("memcpy", output_data, input_data, size);
12880+
12881+  context->AppendCode(coder.str());
12882+  return RET_OK;
12883+}
12884+
12885+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Reshape,
12886+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12887+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Reshape,
12888+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12889+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Reshape,
12890+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12891+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Reshape,
12892+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12893+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Flatten,
12894+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12895+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Flatten,
12896+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12897+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Flatten,
12898+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12899+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_ExpandDims,
12900+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12901+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_ExpandDims,
12902+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12903+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_ExpandDims,
12904+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12905+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_ExpandDims,
12906+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12907+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_ExpandDims,
12908+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12909+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Squeeze,
12910+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12911+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Squeeze,
12912+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12913+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Squeeze,
12914+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12915+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Squeeze,
12916+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12917+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Squeeze,
12918+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12919+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Unsqueeze,
12920+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12921+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Unsqueeze,
12922+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12923+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Unsqueeze,
12924+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12925+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Unsqueeze,
12926+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12927+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Unsqueeze,
12928+                           CPUOpCoderCreator<ReshapeDynamicBaseCoder>)
12929+}  // namespace mindspore::lite::micro
12930diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.h
12931new file mode 100644
12932index 00000000..aaae22eb
12933--- /dev/null
12934+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.h
12935@@ -0,0 +1,38 @@
12936+/**
12937+ * Copyright 2023 Huawei Technologies Co., Ltd
12938+ *
12939+ * Licensed under the Apache License, Version 2.0 (the "License");
12940+ * you may not use this file except in compliance with the License.
12941+ * You may obtain a copy of the License at
12942+ *
12943+ * http://www.apache.org/licenses/LICENSE-2.0
12944+ *
12945+ * Unless required by applicable law or agreed to in writing, software
12946+ * distributed under the License is distributed on an "AS IS" BASIS,
12947+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12948+ * See the License for the specific language governing permissions and
12949+ * limitations under the License.
12950+ */
12951+
12952+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_RESHAPE_DYNAMIC_BASE_CODER_H_
12953+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_RESHAPE_DYNAMIC_BASE_CODER_H_
12954+
12955+#include "tools/converter/micro/coder/opcoders/op_coder.h"
12956+#include "tools/converter/micro/coder/shape_info_container.h"
12957+#include "tools/converter/micro/coder/dynamic_mem_manager.h"
12958+
12959+namespace mindspore::lite::micro {
12960+class ReshapeDynamicBaseCoder final : public OperatorCoder {
12961+ public:
12962+  ReshapeDynamicBaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
12963+                          const LiteGraph::Node *node, size_t node_index, Target target)
12964+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
12965+
12966+  ~ReshapeDynamicBaseCoder() override = default;
12967+
12968+  int Prepare(CoderContext *const context) override;
12969+
12970+  int DoCode(CoderContext *const context) override;
12971+};
12972+}  // namespace mindspore::lite::micro
12973+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_RESHAPE_DYNAMIC_BASE_CODER_H_
12974diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.cc
12975new file mode 100644
12976index 00000000..4b2b0abe
12977--- /dev/null
12978+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.cc
12979@@ -0,0 +1,115 @@
12980+/**
12981+ * Copyright 2023 Huawei Technologies Co., Ltd
12982+ *
12983+ * Licensed under the Apache License, Version 2.0 (the "License");
12984+ * you may not use this file except in compliance with the License.
12985+ * You may obtain a copy of the License at
12986+ *
12987+ * http://www.apache.org/licenses/LICENSE-2.0
12988+ *
12989+ * Unless required by applicable law or agreed to in writing, software
12990+ * distributed under the License is distributed on an "AS IS" BASIS,
12991+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12992+ * See the License for the specific language governing permissions and
12993+ * limitations under the License.
12994+ */
12995+
12996+#include "coder/opcoders/base/strided_slice_dynamic_base_coder.h"
12997+#include <cmath>
12998+#include <string>
12999+#include "mindspore/lite/src/common/log_util.h"
13000+#include "coder/opcoders/file_collector.h"
13001+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
13002+#include "coder/opcoders/parallel.h"
13003+#include "coder/utils/coder_utils.h"
13004+#include "tools/common/string_util.h"
13005+#include "base/float16.h"
13006+
13007+using mindspore::schema::PrimitiveType_StridedSlice;
13008+
13009+namespace mindspore::lite::micro {
13010+namespace {
13011+size_t GetInnerSize(TypeId type_id, size_t inner_elements) {
13012+  switch (type_id) {
13013+    case kNumberTypeInt8:
13014+      return inner_elements * sizeof(int8_t);
13015+    case kNumberTypeFloat32:
13016+      return inner_elements * sizeof(float);
13017+    case kNumberTypeInt32:
13018+      return inner_elements * sizeof(int32_t);
13019+    case kNumberTypeFloat16:
13020+      return inner_elements * sizeof(float16);
13021+    default:
13022+      MS_LOG(ERROR) << "Not supported data type: " << type_id;
13023+      return 0;
13024+  }
13025+}
13026+}  // namespace
13027+
13028+int StridedSliceDynamicBaseCoder::Prepare(CoderContext *context) {
13029+  CHECK_LESS_RETURN(input_tensors_.size(), C2NUM);
13030+  for (size_t i = 1; i < input_tensors_.size(); ++i) {
13031+    MS_CHECK_TRUE_MSG(input_tensors_[i]->IsConst(), RET_PARAM_INVALID,
13032+                      "The " << i << " input of strided slice should be const.");
13033+    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeInt32, RET_PARAM_INVALID,
13034+                      "The " << i << " input tensor data type should be int32.");
13035+  }
13036+  CHECK_LESS_RETURN(output_tensors_.size(), C1NUM);
13037+  strided_slice_param_ = reinterpret_cast<StridedSliceParameter *>(parameter_);
13038+  CHECK_NULL_RETURN(strided_slice_param_);
13039+  auto begin_tensor = input_tensors_.at(1);
13040+  input_shape_ = shape_info_container_->GetTemplateShape(input_tensor_);
13041+  if (input_shape_.size() > DIMENSION_8D || begin_tensor->shape().size() > DIMENSION_8D) {
13042+    MS_LOG(ERROR) << "StridedSlice not support input rank or begin num exceeds " << DIMENSION_8D;
13043+    return RET_ERROR;
13044+  }
13045+  dynamic_param_.in_shape_ = "{";
13046+  for (size_t i = 0; i < input_shape_.size(); ++i) {
13047+    dynamic_param_.in_shape_ += input_shape_[i] + ", ";
13048+  }
13049+  dynamic_param_.in_shape_ += "}";
13050+  return RET_OK;
13051+}
13052+
13053+int StridedSliceDynamicBaseCoder::DoCode(CoderContext *ctx) {
13054+  inner_size_ = GetInnerSize(input_tensor_->data_type(), inner_);
13055+  Collect(ctx,
13056+          {
13057+            "nnacl/fp32/strided_slice_fp32.h",
13058+          },
13059+          {
13060+            "strided_slice_fp32.c",
13061+          });
13062+  switch (input_tensor_->data_type()) {
13063+    case kNumberTypeInt8:
13064+      strided_slice_param_->data_type = ::kNumberTypeInt8;
13065+      break;
13066+    case kNumberTypeFloat32:
13067+      strided_slice_param_->data_type = ::kNumberTypeFloat32;
13068+      break;
13069+    case kNumberTypeInt32:
13070+      strided_slice_param_->data_type = ::kNumberTypeInt32;
13071+      break;
13072+    case kNumberTypeFloat16:
13073+      strided_slice_param_->data_type = ::kNumberTypeFloat16;
13074+      break;
13075+    default:
13076+      MS_LOG(ERROR) << "Not supported data type: " << input_tensor_->data_type();
13077+      return RET_ERROR;
13078+  }
13079+  nnacl::NNaclFp32Serializer code;
13080+  code.CodeStruct("strided_slice_parameter", *strided_slice_param_, dynamic_param_);
13081+  std::string input_data = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
13082+  std::string output_data = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
13083+  code.CodeFunction("DoStridedSlice", input_data, output_data, "&strided_slice_parameter");
13084+  ctx->AppendCode(code.str());
13085+  return RET_OK;
13086+}
13087+
13088+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_StridedSlice,
13089+                           CPUOpCoderCreator<StridedSliceDynamicBaseCoder>)
13090+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat16, PrimitiveType_StridedSlice,
13091+                           CPUOpCoderCreator<StridedSliceDynamicBaseCoder>)
13092+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_StridedSlice,
13093+                           CPUOpCoderCreator<StridedSliceDynamicBaseCoder>)
13094+}  // namespace mindspore::lite::micro
13095diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.h
13096new file mode 100644
13097index 00000000..d41cff4f
13098--- /dev/null
13099+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.h
13100@@ -0,0 +1,45 @@
13101+/**
13102+ * Copyright 2023 Huawei Technologies Co., Ltd
13103+ *
13104+ * Licensed under the Apache License, Version 2.0 (the "License");
13105+ * you may not use this file except in compliance with the License.
13106+ * You may obtain a copy of the License at
13107+ *
13108+ * http://www.apache.org/licenses/LICENSE-2.0
13109+ *
13110+ * Unless required by applicable law or agreed to in writing, software
13111+ * distributed under the License is distributed on an "AS IS" BASIS,
13112+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13113+ * See the License for the specific language governing permissions and
13114+ * limitations under the License.
13115+ */
13116+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_STRIDED_SLICE_BASE_CODER_H_
13117+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_STRIDED_SLICE_BASE_CODER_H_
13118+#include <vector>
13119+#include "coder/opcoders/op_coder.h"
13120+#include "coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h"
13121+#include "nnacl/strided_slice_parameter.h"
13122+
13123+namespace mindspore::lite::micro {
13124+class StridedSliceDynamicBaseCoder final : public OperatorCoder {
13125+ public:
13126+  StridedSliceDynamicBaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
13127+                               const LiteGraph::Node *node, size_t node_index, Target target)
13128+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
13129+
13130+  ~StridedSliceDynamicBaseCoder() override = default;
13131+
13132+  int Prepare(CoderContext *context) override;
13133+
13134+  int DoCode(CoderContext *context) override;
13135+
13136+ private:
13137+  StridedSliceParameter *strided_slice_param_{nullptr};
13138+  StridedSliceDynamicParameter dynamic_param_;
13139+  size_t inner_{1};
13140+  size_t inner_size_{1};
13141+  std::vector<std::string> input_shape_;
13142+  std::vector<std::string> output_shape_;
13143+};
13144+}  // namespace mindspore::lite::micro
13145+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_STRIDED_SLICE_BASE_CODER_H_
13146diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h
13147new file mode 100644
13148index 00000000..1e9e4f8d
13149--- /dev/null
13150+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h
13151@@ -0,0 +1,43 @@
13152+/**
13153+ * Copyright 2023 Huawei Technologies Co., Ltd
13154+ *
13155+ * Licensed under the Apache License, Version 2.0 (the "License");
13156+ * you may not use this file except in compliance with the License.
13157+ * You may obtain a copy of the License at
13158+ *
13159+ * http://www.apache.org/licenses/LICENSE-2.0
13160+ *
13161+ * Unless required by applicable law or agreed to in writing, software
13162+ * distributed under the License is distributed on an "AS IS" BASIS,
13163+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13164+ * See the License for the specific language governing permissions and
13165+ * limitations under the License.
13166+ */
13167+
13168+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_ARITHMETIC_DYNAMIC_PARAMETER_H_
13169+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_ARITHMETIC_DYNAMIC_PARAMETER_H_
13170+#include <string>
13171+
13172+typedef struct ArithmeticDynamicParameter {
13173+  std::string in_shape0_;
13174+  std::string in_elements_num0_;
13175+  std::string in_shape1_;
13176+  std::string in_elements_num1_;
13177+
13178+  std::string out_shape_;
13179+  std::string out_elements_num_;
13180+
13181+  std::string in_strides0_;
13182+  std::string in_strides1_;
13183+  std::string out_strides_;
13184+
13185+  std::string multiples0_;
13186+  std::string multiples1_;
13187+} ArithmeticDynamicParameter;
13188+
13189+typedef struct BroadcastDynamicShapeInfo {
13190+  std::string input_shape_;
13191+  std::string output_shape_;
13192+} BroadcastDynamicShapeInfo;
13193+
13194+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_ARITHMETIC_DYNAMIC_PARAMETER_H_
13195diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h
13196new file mode 100644
13197index 00000000..a05ab848
13198--- /dev/null
13199+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h
13200@@ -0,0 +1,26 @@
13201+/**
13202+ * Copyright 2023 Huawei Technologies Co., Ltd
13203+ *
13204+ * Licensed under the Apache License, Version 2.0 (the "License");
13205+ * you may not use this file except in compliance with the License.
13206+ * You may obtain a copy of the License at
13207+ *
13208+ * http://www.apache.org/licenses/LICENSE-2.0
13209+ *
13210+ * Unless required by applicable law or agreed to in writing, software
13211+ * distributed under the License is distributed on an "AS IS" BASIS,
13212+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13213+ * See the License for the specific language governing permissions and
13214+ * limitations under the License.
13215+ */
13216+
13217+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_CONV_DYNAMIC_PARAMETER_H_
13218+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_CONV_DYNAMIC_PARAMETER_H_
13219+#include <string>
13220+
13221+typedef struct ConvDynamicParameter {
13222+  std::string input_batch_;
13223+  std::string output_batch_;
13224+} ConvDynamicParameter;
13225+
13226+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_CONV_DYNAMIC_PARAMETER_H_
13228diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h
13229new file mode 100644
13230index 00000000..970a863a
13231--- /dev/null
13232+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h
13233@@ -0,0 +1,28 @@
13234+/**
13235+ * Copyright 2023 Huawei Technologies Co., Ltd
13236+ *
13237+ * Licensed under the Apache License, Version 2.0 (the "License");
13238+ * you may not use this file except in compliance with the License.
13239+ * You may obtain a copy of the License at
13240+ *
13241+ * http://www.apache.org/licenses/LICENSE-2.0
13242+ *
13243+ * Unless required by applicable law or agreed to in writing, software
13244+ * distributed under the License is distributed on an "AS IS" BASIS,
13245+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13246+ * See the License for the specific language governing permissions and
13247+ * limitations under the License.
13248+ */
13249+
13250+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_DYNAMIC_LSTM_PARAMETER_H_
13251+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_DYNAMIC_LSTM_PARAMETER_H_
13252+#include <string>
13253+typedef struct DynamicLstmParameter {
13254+  std::string seq_len_;
13255+  std::string batch_;
13256+  std::string input_row_align_;
13257+  std::string state_row_align_;
13258+  std::string output_step_;
13259+} DynamicLstmParameter;
13260+
13261+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_DYNAMIC_LSTM_PARAMETER_H_
13262diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h
13263new file mode 100644
13264index 00000000..d99b0cf9
13265--- /dev/null
13266+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h
13267@@ -0,0 +1,25 @@
13268+/**
13269+ * Copyright 2023 Huawei Technologies Co., Ltd
13270+ *
13271+ * Licensed under the Apache License, Version 2.0 (the "License");
13272+ * you may not use this file except in compliance with the License.
13273+ * You may obtain a copy of the License at
13274+ *
13275+ * http://www.apache.org/licenses/LICENSE-2.0
13276+ *
13277+ * Unless required by applicable law or agreed to in writing, software
13278+ * distributed under the License is distributed on an "AS IS" BASIS,
13279+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13280+ * See the License for the specific language governing permissions and
13281+ * limitations under the License.
13282+ */
13283+
13284+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_MATMUL_DYNAMIC_PARAMETER_H_
13285+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_MATMUL_DYNAMIC_PARAMETER_H_
13286+#include <string>
13287+typedef struct MatmulDynamicParameter {
13288+  std::string row_;
13289+  std::string batch_;
13290+} MatmulDynamicParameter;
13291+
13292+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_MATMUL_DYNAMIC_PARAMETER_H_
13293diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h
13294new file mode 100644
13295index 00000000..f2636e55
13296--- /dev/null
13297+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h
13298@@ -0,0 +1,33 @@
13299+/**
13300+ * Copyright 2023 Huawei Technologies Co., Ltd
13301+ *
13302+ * Licensed under the Apache License, Version 2.0 (the "License");
13303+ * you may not use this file except in compliance with the License.
13304+ * You may obtain a copy of the License at
13305+ *
13306+ * http://www.apache.org/licenses/LICENSE-2.0
13307+ *
13308+ * Unless required by applicable law or agreed to in writing, software
13309+ * distributed under the License is distributed on an "AS IS" BASIS,
13310+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13311+ * See the License for the specific language governing permissions and
13312+ * limitations under the License.
13313+ */
13314+
13315+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_POOLING_DYNAMIC_PARAMETER_H_
13316+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_POOLING_DYNAMIC_PARAMETER_H_
13317+#include <string>
13318+
13319+typedef struct PoolingDynamicParameter {
13320+  int avg_mode_;
13321+  bool global_;
13322+  int window_w_;
13323+  int window_h_;
13324+  int stride_w_;
13325+  int stride_h_;
13326+
13327+  std::string input_batch_;
13328+  std::string output_batch_;
13329+} PoolingDynamicParameter;
13330+
13331+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_POOLING_DYNAMIC_PARAMETER_H_
13332diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h
13333new file mode 100644
13334index 00000000..e8728383
13335--- /dev/null
13336+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h
13337@@ -0,0 +1,26 @@
13338+/**
13339+ * Copyright 2023 Huawei Technologies Co., Ltd
13340+ *
13341+ * Licensed under the Apache License, Version 2.0 (the "License");
13342+ * you may not use this file except in compliance with the License.
13343+ * You may obtain a copy of the License at
13344+ *
13345+ * http://www.apache.org/licenses/LICENSE-2.0
13346+ *
13347+ * Unless required by applicable law or agreed to in writing, software
13348+ * distributed under the License is distributed on an "AS IS" BASIS,
13349+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13350+ * See the License for the specific language governing permissions and
13351+ * limitations under the License.
13352+ */
13353+
13354+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SCALE_DYNAMIC_PARAMETER_H_
13355+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SCALE_DYNAMIC_PARAMETER_H_
13356+#include <string>
13357+
13358+typedef struct ScaleDynamicParameter {
13359+  std::string outer_size_;
13360+  std::string axis_size_;
13361+  std::string inner_size_;
13362+} ScaleDynamicParameter;
13363+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SCALE_DYNAMIC_PARAMETER_H_
13364diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h
13365new file mode 100644
13366index 00000000..f17993d4
13367--- /dev/null
13368+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h
13369@@ -0,0 +1,27 @@
13370+/**
13371+ * Copyright 2023 Huawei Technologies Co., Ltd
13372+ *
13373+ * Licensed under the Apache License, Version 2.0 (the "License");
13374+ * you may not use this file except in compliance with the License.
13375+ * You may obtain a copy of the License at
13376+ *
13377+ * http://www.apache.org/licenses/LICENSE-2.0
13378+ *
13379+ * Unless required by applicable law or agreed to in writing, software
13380+ * distributed under the License is distributed on an "AS IS" BASIS,
13381+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13382+ * See the License for the specific language governing permissions and
13383+ * limitations under the License.
13384+ */
13385+
13386+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SLICE_DYNAMIC_PARAMETER_H_
13387+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SLICE_DYNAMIC_PARAMETER_H_
13388+#include <string>
13389+
13390+typedef struct SliceDynamicParameter {
13391+  std::string shape_;
13392+  std::string size_;
13393+  std::string end_;
13394+} SliceDynamicParameter;
13395+
13396+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SLICE_DYNAMIC_PARAMETER_H_
13397diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h
13398new file mode 100644
13399index 00000000..92dfaf21
13400--- /dev/null
13401+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h
13402@@ -0,0 +1,26 @@
13403+/**
13404+ * Copyright 2023 Huawei Technologies Co., Ltd
13405+ *
13406+ * Licensed under the Apache License, Version 2.0 (the "License");
13407+ * you may not use this file except in compliance with the License.
13408+ * You may obtain a copy of the License at
13409+ *
13410+ * http://www.apache.org/licenses/LICENSE-2.0
13411+ *
13412+ * Unless required by applicable law or agreed to in writing, software
13413+ * distributed under the License is distributed on an "AS IS" BASIS,
13414+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13415+ * See the License for the specific language governing permissions and
13416+ * limitations under the License.
13417+ */
13418+
13419+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SOFTMAX_DYNAMIC_PARAMETER_H_
13420+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SOFTMAX_DYNAMIC_PARAMETER_H_
13421+#include <string>
13422+
13423+typedef struct SoftmaxDynamicParameter {
13424+  std::string input_shape_;
13425+  std::string element_size_;
13426+} SoftmaxDynamicParameter;
13427+
13428+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SOFTMAX_DYNAMIC_PARAMETER_H_
13429diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h
13430new file mode 100644
13431index 00000000..b97097ad
13432--- /dev/null
13433+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h
13434@@ -0,0 +1,26 @@
13435+/**
13436+ * Copyright 2023 Huawei Technologies Co., Ltd
13437+ *
13438+ * Licensed under the Apache License, Version 2.0 (the "License");
13439+ * you may not use this file except in compliance with the License.
13440+ * You may obtain a copy of the License at
13441+ *
13442+ * http://www.apache.org/licenses/LICENSE-2.0
13443+ *
13444+ * Unless required by applicable law or agreed to in writing, software
13445+ * distributed under the License is distributed on an "AS IS" BASIS,
13446+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13447+ * See the License for the specific language governing permissions and
13448+ * limitations under the License.
13449+ */
13450+
13451+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SPLIT_DYNAMIC_PARAMETER_H_
13452+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SPLIT_DYNAMIC_PARAMETER_H_
13453+#include <string>
13454+
13455+typedef struct SplitDynamicParameter {
13456+  std::string strides_;
13457+  std::string split_count_;
13458+} SplitDynamicParameter;
13459+
13460+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SPLIT_DYNAMIC_PARAMETER_H_
13461diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h
13462new file mode 100644
13463index 00000000..202ee7dd
13464--- /dev/null
13465+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h
13466@@ -0,0 +1,25 @@
13467+/**
13468+ * Copyright 2023 Huawei Technologies Co., Ltd
13469+ *
13470+ * Licensed under the Apache License, Version 2.0 (the "License");
13471+ * you may not use this file except in compliance with the License.
13472+ * You may obtain a copy of the License at
13473+ *
13474+ * http://www.apache.org/licenses/LICENSE-2.0
13475+ *
13476+ * Unless required by applicable law or agreed to in writing, software
13477+ * distributed under the License is distributed on an "AS IS" BASIS,
13478+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13479+ * See the License for the specific language governing permissions and
13480+ * limitations under the License.
13481+ */
13482+
13483+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_STRIDED_SLICE_DYNAMIC_PARAMETER_H_
13484+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_STRIDED_SLICE_DYNAMIC_PARAMETER_H_
13485+#include <string>
13486+
13487+typedef struct StridedSliceDynamicParameter {
13488+  std::string in_shape_;
13489+} StridedSliceDynamicParameter;
13490+
13491+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_STRIDED_SLICE_DYNAMIC_PARAMETER_H_
13492diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h
13493new file mode 100644
13494index 00000000..ed4f21f2
13495--- /dev/null
13496+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h
13497@@ -0,0 +1,28 @@
13498+/**
13499+ * Copyright 2023 Huawei Technologies Co., Ltd
13500+ *
13501+ * Licensed under the Apache License, Version 2.0 (the "License");
13502+ * you may not use this file except in compliance with the License.
13503+ * You may obtain a copy of the License at
13504+ *
13505+ * http://www.apache.org/licenses/LICENSE-2.0
13506+ *
13507+ * Unless required by applicable law or agreed to in writing, software
13508+ * distributed under the License is distributed on an "AS IS" BASIS,
13509+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13510+ * See the License for the specific language governing permissions and
13511+ * limitations under the License.
13512+ */
13513+
13514+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_TRANSPOSE_DYNAMIC_PARAMETER_H_
13515+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_TRANSPOSE_DYNAMIC_PARAMETER_H_
13516+#include <string>
13517+
13518+typedef struct TransposeDynamicParameter {
13519+  // shape correlative
13520+  std::string strides_;
13521+  std::string out_strides_;
13522+  std::string data_num_;
13523+} TransposeDynamicParameter;
13524+
13525+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_TRANSPOSE_DYNAMIC_PARAMETER_H_
13526diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc
13527new file mode 100644
13528index 00000000..86048179
13529--- /dev/null
13530+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc
13531@@ -0,0 +1,93 @@
13532+/**
13533+ * Copyright 2023 Huawei Technologies Co., Ltd
13534+ *
13535+ * Licensed under the Apache License, Version 2.0 (the "License");
13536+ * you may not use this file except in compliance with the License.
13537+ * You may obtain a copy of the License at
13538+ *
13539+ * http://www.apache.org/licenses/LICENSE-2.0
13540+ *
13541+ * Unless required by applicable law or agreed to in writing, software
13542+ * distributed under the License is distributed on an "AS IS" BASIS,
13543+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13544+ * See the License for the specific language governing permissions and
13545+ * limitations under the License.
13546+ */
13547+#include "coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h"
13548+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
13549+#include "coder/opcoders/file_collector.h"
13550+#include "coder/utils/coder_utils.h"
13551+#include "tools/common/string_util.h"
13552+
13553+using mindspore::schema::PrimitiveType_Activation;
13554+
13555+namespace mindspore::lite::micro::nnacl {
13556+int ActivationDynamicFP16Coder::Prepare(CoderContext *const context) {
13557+  MS_CHECK_TRUE_MSG(input_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
13558+                    "Input tensor data type is invalid.");
13559+  MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
13560+                    "Output tensor data type is invalid.");
13561+  return RET_OK;
13562+}
13563+
13564+int ActivationDynamicFP16Coder::DoCode(CoderContext *const context) {
13565+  Collect(context,
13566+          {
13567+            "nnacl/fp16/activation_fp16.h",
13568+          },
13569+          {
13570+            "activation_fp16.c",
13571+          });
13572+  NNaclFp32Serializer code;
13573+  // attribute
13574+  auto *activation_parameter = reinterpret_cast<ActivationParameter *>(parameter_);
13575+  MS_CHECK_PTR(activation_parameter);
13576+  auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_);
13577+  count_ = AccumulateShape(in_shape, 0, in_shape.size());
13578+  input_data_ = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
13579+  MS_CHECK_TRUE_MSG(!input_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator");
13580+  output_data_ = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
13581+  MS_CHECK_TRUE_MSG(!output_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator");
13582+  input_data_ = "(float16_t *)(" + input_data_ + ")";
13583+  output_data_ = "(float16_t *)(" + output_data_ + ")";
13584+
13585+  switch (activation_parameter->type_) {
13586+    case schema::ActivationType_RELU:
13587+      code.CodeFunction("ReluFp16", input_data_, output_data_, count_);
13588+      break;
13589+    case schema::ActivationType_RELU6:
13590+      code.CodeFunction("Relu6Fp16", input_data_, output_data_, count_);
13591+      break;
13592+    case schema::ActivationType_LEAKY_RELU:
13593+      code.CodeFunction("LReluFp16", input_data_, output_data_, count_, activation_parameter->alpha_);
13594+      break;
13595+    case schema::ActivationType_SIGMOID:
13596+      code.CodeFunction("SigmoidFp16", input_data_, output_data_, count_);
13597+      break;
13598+    case schema::ActivationType_TANH:
13599+      code.CodeFunction("TanhFp16", input_data_, output_data_, count_);
13600+      break;
13601+    case schema::ActivationType_HSWISH:
13602+      code.CodeFunction("HSwishFp16", input_data_, output_data_, count_);
13603+      break;
13604+    case schema::ActivationType_SWISH:
13605+      code.CodeFunction("SwishFp16", input_data_, output_data_, count_);
13606+      break;
13607+    case schema::ActivationType_HSIGMOID:
13608+      code.CodeFunction("HSigmoidFp16", input_data_, output_data_, count_);
13609+      break;
13610+    case schema::ActivationType_ELU:
13611+      code.CodeFunction("EluFp16", input_data_, output_data_, count_, activation_parameter->alpha_);
13612+      break;
13613+    default:
13614+      MS_LOG(ERROR) << "Activation type error";
13615+      return RET_ERROR;
13616+  }
13617+  MS_LOG(DEBUG) << "ActivationFP16Code has been called";
13618+  context->AppendCode(code.str());
13619+  return lite::RET_OK;
13620+}
13621+
13622+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Activation,
13623+                           CPUOpCoderCreator<ActivationDynamicFP16Coder>)
13624+}  // namespace mindspore::lite::micro::nnacl
13625diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h
13626new file mode 100644
13627index 00000000..c881567f
13628--- /dev/null
13629+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h
13630@@ -0,0 +1,37 @@
13631+/**
13632+ * Copyright 2023 Huawei Technologies Co., Ltd
13633+ *
13634+ * Licensed under the Apache License, Version 2.0 (the "License");
13635+ * you may not use this file except in compliance with the License.
13636+ * You may obtain a copy of the License at
13637+ *
13638+ * http://www.apache.org/licenses/LICENSE-2.0
13639+ *
13640+ * Unless required by applicable law or agreed to in writing, software
13641+ * distributed under the License is distributed on an "AS IS" BASIS,
13642+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13643+ * See the License for the specific language governing permissions and
13644+ * limitations under the License.
13645+ */
13646+
13647+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ACTIVATION_DYNAMIC_FP16_CODER_H_
13648+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ACTIVATION_DYNAMIC_FP16_CODER_H_
13649+
13650+#include <vector>
13651+#include "tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h"
13652+
13653+namespace mindspore::lite::micro::nnacl {
13654+class ActivationDynamicFP16Coder final : public ActivationDynamicFP32Coder {
13655+ public:
13656+  ActivationDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
13657+                             const LiteGraph::Node *node, size_t node_index, Target target)
13658+      : ActivationDynamicFP32Coder(in_tensors, out_tensors, node, node_index, target) {}
13659+
13660+  ~ActivationDynamicFP16Coder() override = default;
13661+
13662+  int Prepare(CoderContext *const context) override;
13663+
13664+  int DoCode(CoderContext *const context) override;
13665+};
13666+}  // namespace mindspore::lite::micro::nnacl
13667+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ACTIVATION_DYNAMIC_FP16_CODER_H_
13668diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc
13669new file mode 100644
13670index 00000000..7050b8b0
13671--- /dev/null
13672+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc
13673@@ -0,0 +1,369 @@
13674+/**
13675+ * Copyright 2023 Huawei Technologies Co., Ltd
13676+ *
13677+ * Licensed under the Apache License, Version 2.0 (the "License");
13678+ * you may not use this file except in compliance with the License.
13679+ * You may obtain a copy of the License at
13680+ *
13681+ * http://www.apache.org/licenses/LICENSE-2.0
13682+ *
13683+ * Unless required by applicable law or agreed to in writing, software
13684+ * distributed under the License is distributed on an "AS IS" BASIS,
13685+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13686+ * See the License for the specific language governing permissions and
13687+ * limitations under the License.
13688+ */
13689+#include "coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h"
13690+#include "coder/opcoders/file_collector.h"
13691+#include "coder/opcoders/parallel.h"
13692+#include "coder/log.h"
13693+#include "coder/utils/coder_utils.h"
13694+#include "tools/common/string_util.h"
13695+
13696+namespace mindspore::lite::micro::nnacl {
13697+namespace {
13698+std::string wrap_void(const std::string &a) { return "(void *)(" + a + ")"; }
13699+}  // namespace
13700+
13701+void ArithmeticDynamicFP16Coder::InitFunTable() {
13702+  fun_table_ = {
13703+    {PrimitiveType_MulFusion, schema::ActivationType_RELU, "ElementMulReluFp16", "", "", "", ""},
13704+    {PrimitiveType_MulFusion, schema::ActivationType_RELU6, "ElementMulRelu6Fp16", "", "", "", ""},
13705+    {PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION, "ElementMulFp16", "", "", "", ""},
13706+    {PrimitiveType_AddFusion, schema::ActivationType_RELU, "ElementAddReluFp16", "", "", "", ""},
13707+    {PrimitiveType_AddFusion, schema::ActivationType_RELU6, "ElementAddRelu6Fp16", "", "", "", ""},
13708+    {PrimitiveType_AddFusion, schema::ActivationType_NO_ACTIVATION, "ElementAddFp16", "", "", "", ""},
13709+    {PrimitiveType_SubFusion, schema::ActivationType_RELU, "ElementSubReluFp16", "", "", "", ""},
13710+    {PrimitiveType_SubFusion, schema::ActivationType_RELU6, "ElementSubRelu6Fp16", "", "", "", ""},
13711+    {PrimitiveType_SubFusion, schema::ActivationType_NO_ACTIVATION, "ElementSubFp16", "", "", "", ""},
13712+    {PrimitiveType_DivFusion, schema::ActivationType_RELU, "ElementDivReluFp16", "", "", "", ""},
13713+    {PrimitiveType_DivFusion, schema::ActivationType_RELU6, "ElementDivRelu6Fp16", "", "", "", ""},
13714+    {PrimitiveType_DivFusion, schema::ActivationType_NO_ACTIVATION, "ElementDivFp16", "", "", "", ""},
13715+    {PrimitiveType_RealDiv, schema::ActivationType_RELU, "ElementDivReluFp16", "", "", "", ""},
13716+    {PrimitiveType_RealDiv, schema::ActivationType_RELU6, "ElementDivRelu6Fp16", "", "", "", ""},
13717+    {PrimitiveType_RealDiv, schema::ActivationType_NO_ACTIVATION, "ElementDivFp16", "", "", "", ""},
13718+    {PrimitiveType_LogicalAnd, schema::ActivationType_NO_ACTIVATION, "ElementLogicalAndFp16", "", "", "", ""},
13719+    {PrimitiveType_LogicalOr, schema::ActivationType_NO_ACTIVATION, "ElementLogicalOrFp16", "", "", "", ""},
13720+    {PrimitiveType_Maximum, schema::ActivationType_NO_ACTIVATION, "ElementMaximumFp16", "", "", "", ""},
13721+    {PrimitiveType_Minimum, schema::ActivationType_NO_ACTIVATION, "ElementMinimumFp16", "", "", "", ""},
13722+    {PrimitiveType_FloorMod, schema::ActivationType_NO_ACTIVATION, "ElementFloorModFp16", "", "", "", ""},
13723+    {PrimitiveType_FloorDiv, schema::ActivationType_NO_ACTIVATION, "ElementFloorDivFp16", "", "", "", ""},
13724+    {PrimitiveType_SquaredDifference, schema::ActivationType_NO_ACTIVATION, "ElementSquaredDifferenceFp16", "", "", "",
13725+     ""}};
13726+}
13727+
13728+int ArithmeticDynamicFP16Coder::Prepare(CoderContext *const context) {
13729+  CHECK_LESS_RETURN(input_tensors_.size(), C2NUM);
13730+  CHECK_LESS_RETURN(output_tensors_.size(), 1);
13731+  for (size_t i = 0; i < input_tensors_.size(); ++i) {
13732+    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
13733+                      "Tensor data type is invalid");
13734+  }
13735+  MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
13736+                    "Tensor data type is invalid");
13737+  filter_tensor_ = input_tensors_.at(SECOND_INPUT);
13738+  MS_CHECK_PTR(filter_tensor_);
13739+  param_ = reinterpret_cast<ArithmeticParameter *>(parameter_);
13740+  MS_CHECK_PTR(param_);
13741+  auto primitive_type = param_->op_parameter_.type_;
13742+  if (primitive_type == schema::PrimitiveType_Eltwise) {
13743+    switch (param_->eltwise_mode_) {
13744+      case schema::EltwiseMode_PROD:
13745+        primitive_type = schema::PrimitiveType_MulFusion;
13746+        break;
13747+      case schema::EltwiseMode_SUM:
13748+        primitive_type = schema::PrimitiveType_AddFusion;
13749+        break;
13750+      case schema::EltwiseMode_MAXIMUM:
13751+        primitive_type = schema::PrimitiveType_Maximum;
13752+        break;
13753+      default:
13754+        MS_LOG(ERROR) << "Eltwise mode not support, mode:" << param_->eltwise_mode_;
13755+        return RET_ERROR;
13756+    }
13757+  }
13758+  InitRunFunction(primitive_type);
13759+  InitDynamicParams();
13760+  ResetStatus();
13761+  CalcMultiplesAndStrides();
13762+  return RET_OK;
13763+}
13764+
int ArithmeticDynamicFP16Coder::DoCode(CoderContext *const context) {
  // Resolve the runtime addresses (as generated-code snippets) of both operands
  // and the output buffer.
  input0_ptr_str_ = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
  input1_ptr_str_ = GetTensorAddr(filter_tensor_, filter_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
  output_ptr_str_ = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
  NNaclFp32Serializer code;
  Collect(context,
          {
            "nnacl/fp16/arithmetic_fp16.h",
            "nnacl/base/broadcast_to.h",
          },
          {
            "arithmetic_fp16.c",
            "arithmetic_base.c",
            "broadcast_to.c",
          });

  // all elements eltwise calculation
  arithmetic_func_str_ = wrap_void(arithmetic_run_);
  // run broadcast
  auto in0_shape = shape_info_container_->GetTemplateShape(input_tensor_);
  std::vector<std::string> in1_shape;
  if (filter_tensor_->IsConst()) {
    // A const second operand contributes literal dims instead of template symbols.
    for (auto dim : filter_tensor_->shape()) {
      in1_shape.emplace_back(std::to_string(dim));
    }
  } else {
    in1_shape = shape_info_container_->GetTemplateShape(filter_tensor_);
  }
  auto out_shape = shape_info_container_->GetTemplateShape(output_tensor_);
  // NOTE(review): reads the member out_shape_ (set in CalcMultiplesAndStrides during
  // Prepare) rather than the local out_shape fetched just above — both come from
  // GetTemplateShape(output_tensor_), presumably identical; confirm intended.
  broadcast_info_.output_shape_size_ = static_cast<int>(out_shape_.size());
  if (in0_shape != out_shape) {
    // Input 0 needs broadcasting; expand it directly into the output buffer.
    broadcast_info_.input_shape_size_ = static_cast<int>(in0_shape.size());
    dynamic_shape_info_.input_shape_ = dynamic_param_.in_shape0_;
    dynamic_shape_info_.output_shape_ = dynamic_param_.out_shape_;
    code.CodeStruct("in0_broadcast_info", broadcast_info_, dynamic_shape_info_);
    code.CodeFunction("BroadcastToSize16", input0_ptr_str_, "&in0_broadcast_info", output_ptr_str_);
    input0_ptr_str_ = output_ptr_str_;
  }
  if (in1_shape != out_shape) {
    // Input 1 also needs broadcasting.
    broadcast_info_.input_shape_size_ = static_cast<int>(in1_shape.size());
    dynamic_shape_info_.input_shape_ = dynamic_param_.in_shape1_;
    dynamic_shape_info_.output_shape_ = dynamic_param_.out_shape_;
    code.CodeStruct("in1_broadcast_info", broadcast_info_, dynamic_shape_info_);
    auto temp = output_ptr_str_;
    if (input0_ptr_str_ == output_ptr_str_) {
      // The output buffer is already occupied by broadcast input 0, so a scratch
      // buffer is needed; allocate one per recorded dynamic-shape scenario.
      std::map<std::string, std::vector<int>> real_nums;
      size_t scene_num = 0;
      for (auto &dim_template : out_shape) {
        auto dim_nums = shape_info_container_->GetRealNums(dim_template);
        MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0.");
        real_nums[dim_template] = dim_nums;
        scene_num = std::max(scene_num, dim_nums.size());
      }
      for (size_t i = 0; i < scene_num; ++i) {
        // Element count for scenario i: literal dims multiply directly; template
        // dims take their i-th recorded value (wrapping when lists differ in length).
        int out_element_num = 1;
        for (size_t j = 0; j < out_shape.size(); ++j) {
          if (IsNumber(out_shape[j])) {
            out_element_num *= std::stoi(out_shape[j]);
          } else {
            out_element_num *= real_nums[out_shape[j]][i % real_nums[out_shape[j]].size()];
          }
        }
        int workspace = out_element_num * DataTypeSize(kNumberTypeFloat16);
        temp = dynamic_mem_manager_->AllocWorkSpace(workspace, i);
        MS_CHECK_TRUE_MSG(!temp.empty(), RET_ERROR, "Arithmetic cannot alloc workspace.");
      }
    }
    code.CodeFunction("BroadcastToSize16", input1_ptr_str_, "&in1_broadcast_info", temp);
    input1_ptr_str_ = temp;
  }
  // Emit the element-wise kernel call on the (possibly broadcast) operands.
  return ExecuteCode("(float16_t *)(" + input0_ptr_str_ + ")", "(float16_t *)(" + input1_ptr_str_ + ")",
                     "(float16_t *)(" + output_ptr_str_ + ")", dynamic_param_.out_elements_num_, context, &code);
}
13838+
13839+void ArithmeticDynamicFP16Coder::InitDynamicParams() {
13840+  auto in0_shape = shape_info_container_->GetTemplateShape(input_tensor_);
13841+  std::vector<std::string> in1_shape;
13842+  if (filter_tensor_->IsConst()) {
13843+    for (auto dim : filter_tensor_->shape()) {
13844+      in1_shape.emplace_back(std::to_string(dim));
13845+    }
13846+  } else {
13847+    in1_shape = shape_info_container_->GetTemplateShape(filter_tensor_);
13848+  }
13849+  auto out_shape = shape_info_container_->GetTemplateShape(output_tensor_);
13850+  dynamic_param_.in_shape0_ = "{";
13851+  dynamic_param_.in_shape1_ = "{";
13852+  dynamic_param_.out_shape_ = "{";
13853+  for (auto shape : in0_shape) {
13854+    dynamic_param_.in_shape0_ += shape + ", ";
13855+  }
13856+  for (auto shape : in1_shape) {
13857+    dynamic_param_.in_shape1_ += shape + ", ";
13858+  }
13859+  for (auto shape : out_shape) {
13860+    dynamic_param_.out_shape_ += shape + ", ";
13861+  }
13862+  dynamic_param_.in_shape0_ += "}";
13863+  dynamic_param_.in_shape1_ += "}";
13864+  dynamic_param_.out_shape_ += "}";
13865+  dynamic_param_.in_elements_num0_ = AccumulateShape(in0_shape, 0, in0_shape.size());
13866+  dynamic_param_.in_elements_num1_ = AccumulateShape(in1_shape, 0, in1_shape.size());
13867+  dynamic_param_.out_elements_num_ = AccumulateShape(out_shape, 0, out_shape.size());
13868+}
13869+
13870+void ArithmeticDynamicFP16Coder::InitRunFunction(int primitive_type) {
13871+  InitFunTable();
13872+  for (size_t i = 0; i < fun_table_.size(); i++) {
13873+    if (fun_table_[i].primitive_type_ == primitive_type && fun_table_[i].activation_type_ == param_->activation_type_) {
13874+      arithmetic_run_ = fun_table_[i].func_;
13875+      arithmetic_run_int_ = fun_table_[i].int_func_;
13876+      arithmetic_run_bool_ = fun_table_[i].bool_func_;
13877+      arithmetic_opt_run_ = fun_table_[i].opt_func_;
13878+      arithmetic_opt_run_int_ = fun_table_[i].opt_int_func_;
13879+    }
13880+  }
13881+  arithmetic_func_type_ = kArithmeticFuncFloat;
13882+}
13883+
13884+void ArithmeticDynamicFP16Coder::ResetStatus() {
13885+  auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_);
13886+  std::vector<std::string> filter_shape;
13887+  if (filter_tensor_->IsConst()) {
13888+    for (auto dim : filter_tensor_->shape()) {
13889+      filter_shape.emplace_back(std::to_string(dim));
13890+    }
13891+  } else {
13892+    filter_shape = shape_info_container_->GetTemplateShape(filter_tensor_);
13893+  }
13894+  auto dim_num = input_shape.size() >= filter_shape.size() ? input_shape.size() : filter_shape.size();
13895+  for (size_t i = 0; i < dim_num - input_shape.size(); ++i) {
13896+    in0_shape_.emplace_back("1");
13897+  }
13898+  in0_shape_.insert(in0_shape_.end(), input_shape.begin(), input_shape.end());
13899+  for (size_t i = 0; i < dim_num - filter_shape.size(); ++i) {
13900+    in1_shape_.emplace_back("1");
13901+  }
13902+  in1_shape_.insert(in1_shape_.end(), filter_shape.begin(), filter_shape.end());
13903+}
13904+
13905+void ArithmeticDynamicFP16Coder::CalcMultiplesAndStrides() {
13906+  out_shape_ = shape_info_container_->GetTemplateShape(output_tensor_);
13907+  dynamic_param_.multiples0_ = "{";
13908+  dynamic_param_.multiples1_ = "{";
13909+  for (size_t i = 0; i < param_->ndim_; i++) {
13910+    if (in0_shape_[i] != "0") {
13911+      dynamic_param_.multiples0_ += out_shape_[i] + " / " + in0_shape_[i] + ", ";
13912+    }
13913+    if (in1_shape_[i] != "0") {
13914+      dynamic_param_.multiples1_ += out_shape_[i] + " / " + in1_shape_[i] + ", ";
13915+    }
13916+  }
13917+  dynamic_param_.multiples0_ += "}";
13918+  dynamic_param_.multiples1_ += "}";
13919+
13920+  // cal strides
13921+  in0_strides_.resize(param_->ndim_);
13922+  in1_strides_.resize(param_->ndim_);
13923+  out_strides_.resize(param_->ndim_);
13924+  ComputeStrides(in0_shape_, in0_strides_);
13925+  ComputeStrides(in1_shape_, in1_strides_);
13926+  ComputeStrides(out_shape_, out_strides_);
13927+  dynamic_param_.in_strides0_ = "{";
13928+  dynamic_param_.in_strides1_ = "{";
13929+  dynamic_param_.out_strides_ = "{";
13930+  for (size_t i = 0; i < param_->ndim_; ++i) {
13931+    dynamic_param_.in_strides0_ += in0_strides_[i] + ", ";
13932+    dynamic_param_.in_strides1_ += in1_strides_[i] + ", ";
13933+    dynamic_param_.out_strides_ += out_strides_[i] + ", ";
13934+  }
13935+  dynamic_param_.in_strides0_ += "}";
13936+  dynamic_param_.in_strides1_ += "}";
13937+  dynamic_param_.out_strides_ += "}";
13938+}
13939+
13940+void ArithmeticDynamicFP16Coder::ComputeStrides(const std::vector<std::string> &shape,
13941+                                                std::vector<std::string> &strides) {
13942+  std::string stride = "1";
13943+  for (int i = param_->ndim_ - 1; i >= 0; i--) {
13944+    strides[i] = stride;
13945+    stride += "*=" + shape[i];
13946+  }
13947+}
13948+
13949+int ArithmeticDynamicFP16Coder::ExecuteCode(const std::string &input0, const std::string &input1,
13950+                                            const std::string &output, const std::string size,
13951+                                            CoderContext *const context, NNaclFp32Serializer *const code) {
13952+  if (arithmetic_func_str_.empty()) {
13953+    return RET_ERROR;
13954+  }
13955+  for (size_t i = 0; i < fun_table_.size(); i++) {
13956+    if (fun_table_[i].primitive_type_ == param_->op_parameter_.type_ &&
13957+        fun_table_[i].activation_type_ == param_->activation_type_) {
13958+      code->CodeFunction(fun_table_[i].func_, input0, input1, output, size);
13959+      break;
13960+    }
13961+  }
13962+  context->AppendCode(code->str());
13963+  return RET_OK;
13964+}
13965+
// Register the dynamic-shape FP16 arithmetic coder for every supported binary
// primitive, on both ARM32 and ARM64 targets.
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_AddFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_MulFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_SubFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_DivFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_RealDiv,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LogicalAnd,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LogicalOr,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Maximum,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Minimum,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_FloorDiv,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_FloorMod,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_SquaredDifference,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Equal,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_NotEqual,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Less,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LessEqual,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Greater,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_GreaterEqual,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Eltwise,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_AddFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_MulFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_SubFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_DivFusion,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_RealDiv,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LogicalAnd,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LogicalOr,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Maximum,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Minimum,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_FloorDiv,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_FloorMod,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_SquaredDifference,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Equal,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_NotEqual,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Less,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LessEqual,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Greater,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_GreaterEqual,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Eltwise,
                           CPUOpCoderCreator<ArithmeticDynamicFP16Coder>)
}  // namespace mindspore::lite::micro::nnacl
14043diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h
14044new file mode 100644
14045index 00000000..87e43687
14046--- /dev/null
14047+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h
14048@@ -0,0 +1,132 @@
14049+/**
14050+ * Copyright 2023 Huawei Technologies Co., Ltd
14051+ *
14052+ * Licensed under the Apache License, Version 2.0 (the "License");
14053+ * you may not use this file except in compliance with the License.
14054+ * You may obtain a copy of the License at
14055+ *
14056+ * http://www.apache.org/licenses/LICENSE-2.0
14057+ *
14058+ * Unless required by applicable law or agreed to in writing, software
14059+ * distributed under the License is distributed on an "AS IS" BASIS,
14060+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14061+ * See the License for the specific language governing permissions and
14062+ * limitations under the License.
14063+ */
14064+
14065+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ARITHMETIC_DYNAMIC_FP16_CODER_H_
14066+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ARITHMETIC_DYNAMIC_FP16_CODER_H_
14067+
14068+#include <vector>
14069+#include <string>
14070+#include "coder/opcoders/op_coder.h"
14071+#include "nnacl/base/cast_base.h"
14072+#include "nnacl/arithmetic_parameter.h"
14073+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
14074+#include "coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h"
14075+#include "nnacl/broadcast_to_parameter.h"
14076+
14077+namespace mindspore::lite::micro::nnacl {
14078+using mindspore::schema::PrimitiveType_AddFusion;
14079+using mindspore::schema::PrimitiveType_DivFusion;
14080+using mindspore::schema::PrimitiveType_Eltwise;
14081+using mindspore::schema::PrimitiveType_Equal;
14082+using mindspore::schema::PrimitiveType_FloorDiv;
14083+using mindspore::schema::PrimitiveType_FloorMod;
14084+using mindspore::schema::PrimitiveType_Greater;
14085+using mindspore::schema::PrimitiveType_GreaterEqual;
14086+using mindspore::schema::PrimitiveType_Less;
14087+using mindspore::schema::PrimitiveType_LessEqual;
14088+using mindspore::schema::PrimitiveType_LogicalAnd;
14089+using mindspore::schema::PrimitiveType_LogicalOr;
14090+using mindspore::schema::PrimitiveType_Maximum;
14091+using mindspore::schema::PrimitiveType_Minimum;
14092+using mindspore::schema::PrimitiveType_Mod;
14093+using mindspore::schema::PrimitiveType_MulFusion;
14094+using mindspore::schema::PrimitiveType_NotEqual;
14095+using mindspore::schema::PrimitiveType_RealDiv;
14096+using mindspore::schema::PrimitiveType_SquaredDifference;
14097+using mindspore::schema::PrimitiveType_SubFusion;
14098+
// Micro-coder that generates C source for binary FP16 arithmetic ops whose
// tensor shapes are dynamic (represented as template strings at codegen time).
class ArithmeticDynamicFP16Coder final : public OperatorCoder {
  // One row of the kernel lookup table: a (primitive, activation) pair mapped
  // to the names of the float/int/bool and optimized kernel functions.
  typedef struct {
    int primitive_type_;
    int activation_type_;
    std::string func_;
    std::string int_func_;
    std::string bool_func_;
    std::string opt_func_;
    std::string opt_int_func_;
  } ARITHMETIC_FUNC_INFO_FP16;

  //  typedef struct MATRIC_INFO {
  //    bool is_const{false};
  //    bool is_valid{false};
  //    void *data{nullptr};
  //    int64_t inner_size{1};  // the element num of once batch
  //    std::vector<int64_t> shape;
  //    std::vector<int64_t> batch_post_sum;
  //    void Reset() {
  //      is_valid = false;
  //      data = nullptr;
  //      inner_size = 1;
  //      shape.clear();
  //      batch_post_sum.clear();
  //    }
  //  } MATRIC_INFO;

 public:
  ArithmeticDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                             const LiteGraph::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}

  ~ArithmeticDynamicFP16Coder() override = default;

  int DoCode(CoderContext *const context) override;

 private:
  // Validates FP16 dtypes, resolves Eltwise to a concrete op, selects the kernel
  // and precomputes shape/stride strings.
  int Prepare(CoderContext *const context) override;

  // Populates fun_table_ with the supported (primitive, activation) -> kernel rows.
  void InitFunTable();

  // Picks the kernel function names for the given primitive type.
  void InitRunFunction(int primitive_type);

  // Builds the symbolic shape / element-count strings in dynamic_param_.
  void InitDynamicParams();

  // Rank-aligns both operand shapes by left-padding with "1".
  void ResetStatus();

  // Builds symbolic broadcast multiples and row-major stride strings.
  void CalcMultiplesAndStrides();

  // Emits symbolic row-major strides for one shape into `strides`.
  void ComputeStrides(const std::vector<std::string> &shape, std::vector<std::string> &strides);

  // Emits the final kernel invocation into `code` and appends it to the context.
  int ExecuteCode(const std::string &input0, const std::string &input1, const std::string &output,
                  const std::string size, CoderContext *const context, NNaclFp32Serializer *const code);

  std::vector<ARITHMETIC_FUNC_INFO_FP16> fun_table_;
  ArithmeticFuncType arithmetic_func_type_{kArithmeticFuncUnknow};
  ArithmeticParameter *param_{nullptr};  // borrowed from parameter_; not owned
  ArithmeticDynamicParameter dynamic_param_;
  BroadcastShapeInfo broadcast_info_;
  BroadcastDynamicShapeInfo dynamic_shape_info_;
  Tensor *filter_tensor_{nullptr};  // second input operand
  // Generated-code snippets naming the operand/output addresses.
  std::string input0_ptr_str_;
  std::string input1_ptr_str_;
  std::string output_ptr_str_;
  // Kernel function names selected in InitRunFunction().
  std::string arithmetic_run_;
  std::string arithmetic_run_int_;
  std::string arithmetic_opt_run_;
  std::string arithmetic_opt_run_int_;
  std::string arithmetic_run_bool_;
  std::string arithmetic_func_str_;
  // Rank-aligned symbolic shapes and their derived strides.
  std::vector<std::string> in0_shape_;
  std::vector<std::string> in1_shape_;
  std::vector<std::string> out_shape_;
  std::vector<std::string> in0_strides_;
  std::vector<std::string> in1_strides_;
  std::vector<std::string> out_strides_;
  //  MATRIC_INFO a_matric_;
  //  MATRIC_INFO b_matric_;
  //  MATRIC_INFO c_matric_;
};
14179+}  // namespace mindspore::lite::micro::nnacl
14180+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ARITHMETIC_DYNAMIC_FP16_CODER_H_
14181diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc
14182new file mode 100644
14183index 00000000..bf8bd06b
14184--- /dev/null
14185+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc
14186@@ -0,0 +1,92 @@
14187+/**
14188+ * Copyright 2023 Huawei Technologies Co., Ltd
14189+ *
14190+ * Licensed under the Apache License, Version 2.0 (the "License");
14191+ * you may not use this file except in compliance with the License.
14192+ * You may obtain a copy of the License at
14193+ *
14194+ * http://www.apache.org/licenses/LICENSE-2.0
14195+ *
14196+ * Unless required by applicable law or agreed to in writing, software
14197+ * distributed under the License is distributed on an "AS IS" BASIS,
14198+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14199+ * See the License for the specific language governing permissions and
14200+ * limitations under the License.
14201+ */
14202+#include "coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h"
14203+#include <string>
14204+#include <vector>
14205+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
14206+#include "coder/opcoders/file_collector.h"
14207+#include "coder/opcoders/parallel.h"
14208+#include "coder/utils/coder_utils.h"
14209+
14210+using mindspore::schema::PrimitiveType_Concat;
14211+
14212+namespace mindspore::lite::micro::nnacl {
14213+int ConcatDynamicFP16Coder::Prepare(CoderContext *const context) {
14214+  for (size_t i = 0; i < input_tensors_.size(); ++i) {
14215+    MS_CHECK_TRUE_MSG(input_tensors_.at(i)->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
14216+                      "input tensor data type is invalid.");
14217+  }
14218+  concat_param_ = reinterpret_cast<ConcatParameter *>(parameter_);
14219+  MS_CHECK_PTR(concat_param_);
14220+  auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_);
14221+  axis_ =
14222+    concat_param_->axis_ >= 0 ? concat_param_->axis_ : static_cast<int>(input_shape.size()) + concat_param_->axis_;
14223+  return RET_OK;
14224+}
14225+
int ConcatDynamicFP16Coder::DoCode(CoderContext *const context) {
  Collect(context,
          {
            "nnacl/base/concat_base.h",
          },
          {
            "concat_base.c",
          });

  size_t input_num = input_tensors_.size();

  NNaclFp32Serializer code;
  // Emit an array holding every input's runtime address.
  code << "\t\tvoid *inputs_addr[] = {";
  for (size_t i = 0; i < input_num; ++i) {
    code << "(void *)("
         << GetTensorAddr(input_tensors_.at(i), input_tensors_.at(i)->IsConst(), dynamic_mem_manager_, allocator_)
         << "), ";
  }
  code << "};\n";

  // Emit shape_0 .. shape_{n-1} for the inputs. `i` intentionally survives the
  // loop so that the output shape emitted below becomes shape_{n}.
  size_t i;
  for (i = 0; i < input_num; ++i) {
    code << "\t\tint shape_" << i << "[] = {";
    auto in_shape = shape_info_container_->GetTemplateShape(input_tensors_.at(i));
    for (auto &shape : in_shape) {
      code << shape << ", ";
    }
    code << "};\n";
  }

  auto out_shape = shape_info_container_->GetTemplateShape(output_tensor_);
  code << "\t\tint shape_" << i << "[] = {";
  for (auto &shape : out_shape) {
    code << shape << ", ";
  }
  code << "};\n";

  // Gather all shape arrays (inputs then output) into one pointer table, as the
  // Concat kernel expects.
  code << "\t\tint *inputs_output_shape[] = {";
  for (i = 0; i <= input_num; ++i) {
    code << "shape_" << i << ", ";
  }
  code << "};\n";
  std::string output_data = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
  // Single-threaded call (task_id 0 of 1); sizeof(uint16_t) matches the FP16 element width.
  code.CodeFunction("Concat", "inputs_addr", input_num, axis_, "inputs_output_shape", out_shape.size(), output_data, 0,
                    1, sizeof(uint16_t));
  context->AppendCode(code.str());
  return RET_OK;
}
14274+
// Register the dynamic-shape FP16 concat coder for both ARM targets.
REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Concat, CPUOpCoderCreator<ConcatDynamicFP16Coder>)
REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Concat, CPUOpCoderCreator<ConcatDynamicFP16Coder>)

}  // namespace mindspore::lite::micro::nnacl
14279diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h
14280new file mode 100644
14281index 00000000..bd1b7ff6
14282--- /dev/null
14283+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h
14284@@ -0,0 +1,40 @@
14285+/**
14286+ * Copyright 2023 Huawei Technologies Co., Ltd
14287+ *
14288+ * Licensed under the Apache License, Version 2.0 (the "License");
14289+ * you may not use this file except in compliance with the License.
14290+ * You may obtain a copy of the License at
14291+ *
14292+ * http://www.apache.org/licenses/LICENSE-2.0
14293+ *
14294+ * Unless required by applicable law or agreed to in writing, software
14295+ * distributed under the License is distributed on an "AS IS" BASIS,
14296+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14297+ * See the License for the specific language governing permissions and
14298+ * limitations under the License.
14299+ */
14300+
14301+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONCAT_DYNAMIC_FP16_CODER_H_
14302+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONCAT_DYNAMIC_FP16_CODER_H_
14303+
14304+#include <vector>
14305+#include "coder/opcoders/op_coder.h"
14306+#include "nnacl/concat_parameter.h"
14307+
14308+namespace mindspore::lite::micro::nnacl {
// Micro-coder that generates C source for FP16 Concat over dynamic (template) shapes.
class ConcatDynamicFP16Coder final : public OperatorCoder {
 public:
  ConcatDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                         const LiteGraph::Node *node, size_t node_index, Target target)
      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
  ~ConcatDynamicFP16Coder() override = default;

  int Prepare(CoderContext *const context) override;
  int DoCode(CoderContext *const context) override;

 private:
  int axis_{0};  // concat axis, normalized to be non-negative in Prepare()
  ConcatParameter *concat_param_{nullptr};  // borrowed from parameter_; not owned
};
14323+}  // namespace mindspore::lite::micro::nnacl
14324+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONCAT_DYNAMIC_FP16_CODER_H_
14325diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc
14326new file mode 100644
14327index 00000000..2f4e42e7
14328--- /dev/null
14329+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc
14330@@ -0,0 +1,155 @@
14331+/**
14332+ * Copyright 2023 Huawei Technologies Co., Ltd
14333+ *
14334+ * Licensed under the Apache License, Version 2.0 (the "License");
14335+ * you may not use this file except in compliance with the License.
14336+ * You may obtain a copy of the License at
14337+ *
14338+ * http://www.apache.org/licenses/LICENSE-2.0
14339+ *
14340+ * Unless required by applicable law or agreed to in writing, software
14341+ * distributed under the License is distributed on an "AS IS" BASIS,
14342+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14343+ * See the License for the specific language governing permissions and
14344+ * limitations under the License.
14345+ */
14346+
14347+#include "coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h"
14348+#include "src/common/version_manager.h"
14349+#include "src/common/tensor_util.h"
14350+#include "src/common/ops/populate/populate_register.h"
14351+#include "nnacl/fp32/winograd_utils.h"
14352+#include "nnacl/base/conv_common_base.h"
14353+#include "nnacl/infer/conv2d_infer.h"
14354+#include "coder/shape_info_container.h"
14355+#include "coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h"
14356+#include "coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h"
14357+
14358+using mindspore::schema::PrimitiveType_Conv2DFusion;
14359+namespace mindspore::lite::micro::nnacl {
int ConvDelegateDynamicFP16Coder::Prepare(CoderContext *const context) {
  // Validate that every input and output tensor is FP16 before delegating.
  for (size_t i = 0; i < input_tensors_.size(); ++i) {
    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
                      "Input tensor data type is invalid");
  }
  for (size_t i = 0; i < output_tensors_.size(); ++i) {
    MS_CHECK_TRUE_MSG(output_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
                      "Output tensor data type is invalid");
  }
  // Update shape info of input and output
  // NOTE(review): dynamic_param is filled by SetInputOutputShapeInfo but never read
  // afterwards — only the ConvParameter side effects persist; confirm intended.
  ConvDynamicParameter dynamic_param;
  SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(parameter_), dynamic_param, input_tensor_, output_tensor_);
  if (conv_coder_ == nullptr) {
    // need to select actual execute coder here
    conv_coder_ =
      CPUConvFP16DynamicCoderSelect(input_tensors_, output_tensors_, node_, node_index(), target_, schema_version_);
    MS_CHECK_PTR(conv_coder_);
    // Give the selected coder its own copy of the ConvParameter; the malloc'ed
    // block is handed over via set_parameter (freed on the error path below).
    ConvParameter *op_parameter = static_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
    if (op_parameter == nullptr) {
      MS_LOG(ERROR) << "malloc ConvParameter failed.";
      return RET_ERROR;
    }
    if (memcpy_s(op_parameter, sizeof(ConvParameter), parameter_, sizeof(ConvParameter)) != EOK) {
      MS_LOG(ERROR) << "memcpy_s failed.";
      free(op_parameter);
      return RET_ERROR;
    }
    conv_coder_->set_type(GetPrimitiveType(node_->primitive_, schema_version_));
    conv_coder_->set_thread_num(thread_num_);
    conv_coder_->set_parameter(reinterpret_cast<OpParameter *>(op_parameter));
    conv_coder_->set_shape_info_container(shape_info_container_);
    conv_coder_->set_dynamic_mem_manager(dynamic_mem_manager_);
  }
  return conv_coder_->Prepare(context);
}
14395+
14396+int ConvDelegateDynamicFP16Coder::DoCode(CoderContext *const context) { return conv_coder_->DoCode(context); }
14397+
14398+void ConvDelegateDynamicFP16Coder::SetInputOutputShapeInfo(ConvParameter *conv_param,
14399+                                                           ConvDynamicParameter &dynamic_param,
14400+                                                           const lite::Tensor *input, const lite::Tensor *output) {
14401+  dynamic_param.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_).at(0);
14402+  conv_param->input_h_ = input->Height();
14403+  conv_param->input_w_ = input->Width();
14404+  conv_param->input_channel_ = input->Channel();
14405+  dynamic_param.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_).at(0);
14406+  conv_param->output_h_ = output->Height();
14407+  conv_param->output_w_ = output->Width();
14408+  conv_param->output_channel_ = output->Channel();
14409+}
14410+
14411+std::unique_ptr<OperatorCoder> CPUConvFP16DynamicCoderSelect(const std::vector<lite::Tensor *> &in_tensors,
14412+                                                             const std::vector<lite::Tensor *> &out_tensors,
14413+                                                             const LiteGraph::Node *node, size_t node_index,
14414+                                                             Target target, int schema_version) {
14415+  const void *primitive = node->primitive_;
14416+  if (primitive == nullptr) {
14417+    return nullptr;
14418+  }
14419+  ParameterGen paramGen = PopulateRegistry::GetInstance()->GetParameterCreator(
14420+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
14421+  MS_CHECK_PTR_RET_NULL(paramGen);
14422+  auto conv_param = reinterpret_cast<ConvParameter *>(paramGen(node->primitive_));
14423+  MS_CHECK_PTR_RET_NULL(conv_param);
14424+  int kernel_h = conv_param->kernel_h_;
14425+  int kernel_w = conv_param->kernel_w_;
14426+  conv_param->input_h_ = in_tensors.at(kInputIndex)->Height();
14427+  conv_param->input_w_ = in_tensors.at(kInputIndex)->Width();
14428+  conv_param->input_channel_ = in_tensors.at(kInputIndex)->Channel();
14429+  conv_param->output_h_ = out_tensors.at(kOutputIndex)->Height();
14430+  conv_param->output_w_ = out_tensors.at(kOutputIndex)->Width();
14431+  conv_param->output_channel_ = out_tensors.at(kOutputIndex)->Channel();
14432+  conv_param->op_parameter_.thread_num_ = 1;
14433+  free(conv_param);
14434+  std::unique_ptr<OperatorCoder> coder;
14435+  if (kernel_h == 1 && kernel_w == 1) {
14436+    MS_LOG(DEBUG) << "create Convolution1x1DynamicFP16CPUKernel";
14437+    coder = CPUOpCoderCreator<Convolution1x1DynamicFP16Coder>(in_tensors, out_tensors, node, node_index, target,
14438+                                                              schema_version);
14439+  } else {
14440+    MS_LOG(DEBUG) << "create ConvolutionDynamicFP16Coder";
14441+    coder =
14442+      CPUOpCoderCreator<ConvolutionDynamicFP16Coder>(in_tensors, out_tensors, node, node_index, target, schema_version);
14443+  }
14444+  return coder;
14445+}
14446+
14447+std::unique_ptr<OperatorCoder> CreateConvDelegateFp16(const std::vector<lite::Tensor *> &in_tensors,
14448+                                                      const std::vector<lite::Tensor *> &out_tensors,
14449+                                                      const LiteGraph::Node *node, size_t node_index, Target target,
14450+                                                      int schema_version) {
14451+  return CPUOpCoderCreator<ConvDelegateDynamicFP16Coder>(in_tensors, out_tensors, node, node_index, target,
14452+                                                         schema_version);
14453+}
14454+
14455+std::unique_ptr<OperatorCoder> CPUConv2DFusionDynamicFP16CoderCreator(const std::vector<lite::Tensor *> &in_tensors,
14456+                                                                      const std::vector<lite::Tensor *> &out_tensors,
14457+                                                                      const LiteGraph::Node *node, size_t node_index,
14458+                                                                      Target target, int schema_version) {
14459+  const void *primitive = node->primitive_;
14460+  if (primitive == nullptr) {
14461+    return nullptr;
14462+  }
14463+  ParameterGen param_gen = PopulateRegistry::GetInstance()->GetParameterCreator(
14464+    GetPrimitiveType(node->primitive_, schema_version), schema_version);
14465+  if (param_gen == nullptr) {
14466+    MS_LOG(ERROR) << "parameter generator is null";
14467+    return nullptr;
14468+  }
14469+  auto conv_param = reinterpret_cast<ConvParameter *>(param_gen(node->primitive_));
14470+  std::unique_ptr<OperatorCoder> coder;
14471+  if (conv_param != nullptr && conv_param->group_ == 1) {
14472+    coder = CreateConvDelegateFp16(in_tensors, out_tensors, node, node_index, target, schema_version);
14473+  } else {
14474+    // GroupConv
14475+    MS_LOG(ERROR) << "currently, only support conv_param->group_ == 1 in dynamic coder scene";
14476+    free(conv_param); return nullptr;
14477+  }
14478+  free(conv_param); return coder;
14479+}
14480+
14481+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Conv2DFusion,
14482+                           CPUConv2DFusionDynamicFP16CoderCreator)
14483+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Conv2DFusion,
14484+                           CPUConv2DFusionDynamicFP16CoderCreator)
14485+}  // namespace mindspore::lite::micro::nnacl
14486diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h
14487new file mode 100644
14488index 00000000..c352c469
14489--- /dev/null
14490+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h
14491@@ -0,0 +1,56 @@
14492+/**
14493+ * Copyright 2023 Huawei Technologies Co., Ltd
14494+ *
14495+ * Licensed under the Apache License, Version 2.0 (the "License");
14496+ * you may not use this file except in compliance with the License.
14497+ * You may obtain a copy of the License at
14498+ *
14499+ * http://www.apache.org/licenses/LICENSE-2.0
14500+ *
14501+ * Unless required by applicable law or agreed to in writing, software
14502+ * distributed under the License is distributed on an "AS IS" BASIS,
14503+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14504+ * See the License for the specific language governing permissions and
14505+ * limitations under the License.
14506+ */
14507+
14508+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONV2D_DELEGATE_DYNAMIC_FP16_CODER_H_
14509+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONV2D_DELEGATE_DYNAMIC_FP16_CODER_H_
14510+#include <vector>
14511+#include <memory>
14512+#include "coder/opcoders/op_coder.h"
14513+#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h"
14514+#include "nnacl/conv_parameter.h"
14515+
14516+namespace mindspore::lite::micro::nnacl {
14517+class ConvDelegateDynamicFP16Coder : public OperatorCoder {
14518+ public:
14519+  ConvDelegateDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
14520+                               const LiteGraph::Node *node, size_t node_index, Target target)
14521+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
14522+
14523+  ~ConvDelegateDynamicFP16Coder() override = default;
14524+  int Prepare(CoderContext *const context) override;
14525+  int DoCode(CoderContext *const context) override;
14526+
14527+ protected:
14528+  std::unique_ptr<OperatorCoder> conv_coder_ = nullptr;
14529+  ConvParameter *conv_param_{nullptr};
14530+  ConvDynamicParameter dynamic_param_;
14531+
14532+ private:
14533+  void SetInputOutputShapeInfo(ConvParameter *conv_param, ConvDynamicParameter &dynamic_param,
14534+                               const lite::Tensor *input, const lite::Tensor *output);
14535+};
14536+
14537+std::unique_ptr<OperatorCoder> CPUConvFP16DynamicCoderSelect(const std::vector<lite::Tensor *> &in_tensors,
14538+                                                             const std::vector<lite::Tensor *> &out_tensors,
14539+                                                             const LiteGraph::Node *node, size_t node_index,
14540+                                                             Target target, int schema_version);
14541+
14542+std::unique_ptr<OperatorCoder> CPUConv2DFusionDynamicFP16CoderCreator(const std::vector<lite::Tensor *> &in_tensors,
14543+                                                                      const std::vector<lite::Tensor *> &out_tensors,
14544+                                                                      const LiteGraph::Node *node, size_t node_index,
14545+                                                                      Target target, int schema_version);
14546+}  // namespace mindspore::lite::micro::nnacl
14547+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONV2D_DELEGATE_DYNAMIC_FP16_CODER_H_
14548diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc
14549new file mode 100644
14550index 00000000..c682b2ed
14551--- /dev/null
14552+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc
14553@@ -0,0 +1,252 @@
14554+/**
14555+ * Copyright 2023 Huawei Technologies Co., Ltd
14556+ *
14557+ * Licensed under the Apache License, Version 2.0 (the "License");
14558+ * you may not use this file except in compliance with the License.
14559+ * You may obtain a copy of the License at
14560+ *
14561+ * http://www.apache.org/licenses/LICENSE-2.0
14562+ *
14563+ * Unless required by applicable law or agreed to in writing, software
14564+ * distributed under the License is distributed on an "AS IS" BASIS,
14565+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14566+ * See the License for the specific language governing permissions and
14567+ * limitations under the License.
14568+ */
14569+
14570+#include "coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h"
14571+#include <string>
14572+#include <vector>
14573+#include "nnacl/fp32/winograd_utils.h"
14574+#include "coder/opcoders/file_collector.h"
14575+#include "coder/opcoders/parallel.h"
14576+#include "coder/utils/coder_utils.h"
14577+
14578+namespace mindspore::lite::micro::nnacl {
14579+int Convolution1x1DynamicFP16Coder::Prepare(CoderContext *const context) {
14580+  CHECK_LESS_RETURN(input_tensors_.size(), C2NUM);
14581+  CHECK_LESS_RETURN(output_tensors_.size(), 1);
14582+  for (size_t i = 0; i < input_tensors_.size(); ++i) {
14583+    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
14584+                      "Tensor data type is invalid");
14585+  }
14586+  for (size_t i = 0; i < output_tensors_.size(); ++i) {
14587+    MS_CHECK_TRUE_MSG(output_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
14588+                      "Tensor data type is invalid");
14589+  }
14590+  if (target_ == kARM64) {
14591+    row_tile_ = (output_tensor_->format() == NC4HW4) ? C16NUM : C12NUM;
14592+    col_tile_ = (output_tensor_->format() == NC4HW4) ? C8NUM : C16NUM;
14593+  }
14594+  if (matmul_param_ == nullptr) {
14595+    matmul_param_ = new (std::nothrow) MatMulParameter();
14596+    if (matmul_param_ == nullptr) {
14597+      MS_LOG(ERROR) << "Init matmul_param_ failed.";
14598+      return RET_ERROR;
14599+    }
14600+  }
14601+  conv_param_ = reinterpret_cast<ConvParameter *>(parameter_);
14602+  filter_tensor_ = input_tensors_.at(kWeightIndex);
14603+  MS_CHECK_PTR(filter_tensor_);
14604+  if (input_tensors_.size() == kInputSize2) {
14605+    bias_tensor_ = input_tensors_.at(kBiasIndex);
14606+    MS_CHECK_PTR(bias_tensor_);
14607+  } else {
14608+    MS_CHECK_TRUE(input_tensors_.size() == kInputSize1, "wrong input size");
14609+  }
14610+  dynamic_param_.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_)[0];
14611+  conv_param_->input_h_ = input_tensor_->Height();
14612+  conv_param_->input_w_ = input_tensor_->Width();
14613+  conv_param_->input_channel_ = input_tensor_->Channel();
14614+  dynamic_param_.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_)[0];
14615+  conv_param_->output_h_ = output_tensor_->Height();
14616+  conv_param_->output_w_ = output_tensor_->Width();
14617+  conv_param_->output_channel_ = output_tensor_->Channel();
14618+  MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");
14619+  MS_CHECK_RET_CODE(InitMatmulParam(), "Init matmul param failed.");
14620+  MS_CHECK_RET_CODE(InitTmpBuffer(context), "Init tmp buffer failed.");
14621+  return RET_OK;
14622+}
14623+
14624+int Convolution1x1DynamicFP16Coder::DoCode(CoderContext *const context) {
14625+  CollectFilesForFunc(context);
14626+  NNaclFp32Serializer code;
14627+  MS_CHECK_RET_CODE(ComputeWorkspace(), "ComputeWorkspace failed.");
14628+  auto tmp_input_str = "(float16_t *)(" + allocator_->GetRuntimeAddr(static_cast<float16 *>(tmp_input_)) + ")";
14629+  auto input_str =
14630+    "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
14631+  auto output_str =
14632+    "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
14633+  auto packed_weight_str = allocator_->GetRuntimeAddr(static_cast<float16 *>(packed_weight_));
14634+
14635+  code << "  for (int batch_index = 0; batch_index < " << dynamic_param_.input_batch_ << "; batch_index++) {\n";
14636+  output_ptr_ = output_str + " + batch_index * " + std::to_string(matmul_param_->row_ * matmul_param_->col_);
14637+  auto batch_in = input_str + " + batch_index * " +
14638+                  std::to_string(conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_);
14639+  if (pre_trans_input_) {
14640+    code.CodeStruct("conv_parameter", *conv_param_, dynamic_param_);
14641+    code.CodeFunction("Conv1x1InputPack", batch_in, tmp_input_str, "&conv_parameter", DataTypeSize(data_type_));
14642+  } else {
14643+    tmp_input_str = batch_in;
14644+  }
14645+
14646+  if (output_tensor_->format() == NC4HW4) {
14647+    code.CodeFunction(target_ == kARM64 ? "RowMajor2Col16MajorFp16Opt" : "RowMajor2Col12MajorFp16Opt", tmp_input_str,
14648+                      "(float16_t *)(" + pack_input_str_ + ")", matmul_param_->row_, matmul_param_->deep_);
14649+  } else {
14650+    code.CodeFunction("RowMajor2Col12MajorFp16Opt", tmp_input_str, "(float16_t *)(" + pack_input_str_ + ")",
14651+                      matmul_param_->row_, matmul_param_->deep_);
14652+  }
14653+
14654+  if (output_tensor_->format() == NC4HW4) {
14655+    code.CodeStruct("matmul_param", *matmul_param_);
14656+    code.CodeFunction("Conv1x1OutNc8hw8MultiThreadByWeightFp16", tmp_input_str,
14657+                      "(float16_t *)(" + pack_input_str_ + ")", packed_weight_str, bias_data_, output_ptr_,
14658+                      kDefaultTaskId, "&matmul_param");
14659+  } else {
14660+    code.CodeFunction(target_ == kARM64 ? "MatMul12x16Fp16Opt" : "MatMul12x8A32Fp16",
14661+                      "(float16_t *)(" + pack_input_str_ + ")", packed_weight_str, output_ptr_, bias_data_,
14662+                      matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, matmul_param_->col_,
14663+                      matmul_param_->col_, OutType_Nhwc);
14664+  }
14665+  code << "  }\n";
14666+  context->AppendCode(code.str());
14667+  return RET_OK;
14668+}
14669+
14670+Convolution1x1DynamicFP16Coder::~Convolution1x1DynamicFP16Coder() {
14671+  FreeTmpBuffer();
14672+  if (matmul_param_ != nullptr) {
14673+    delete matmul_param_;
14674+    matmul_param_ = nullptr;
14675+  }
14676+  return;
14677+}
14678+
14679+void Convolution1x1DynamicFP16Coder::FreeTmpBuffer() {
14680+  if (pre_trans_input_ && tmp_input_ != nullptr) {
14681+    // tmp_input_ is allocator_-managed online-pack memory; free() here would double-free.
14682+    tmp_input_ = nullptr;
14683+  }
14684+  return;
14685+}
14686+
14687+int Convolution1x1DynamicFP16Coder::ComputeWorkspace() {
14688+  pack_input_size_ = matmul_param_->row_align_ * matmul_param_->deep_ * DataTypeSize(data_type_);
14689+  auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_);
14690+  size_t scene_num = 0;
14691+  for (auto &dim_template : input_shape) {
14692+    auto dim_nums = shape_info_container_->GetRealNums(dim_template);
14693+    MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0.");
14694+    scene_num = std::max(scene_num, dim_nums.size());
14695+  }
14696+  for (size_t i = 0; i < scene_num; ++i) {
14697+    pack_input_str_ = dynamic_mem_manager_->AllocWorkSpace(pack_input_size_, i);
14698+    MS_CHECK_TRUE_MSG(!pack_input_str_.empty(), RET_ERROR, "Convolution cannot alloc workspace.");
14699+  }
14700+  return RET_OK;
14701+}
14702+
14703+int Convolution1x1DynamicFP16Coder::InitMatmulParam() {
14704+  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
14705+  matmul_param_->col_ = conv_param_->output_channel_;
14706+  matmul_param_->deep_ = conv_param_->input_channel_;
14707+  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
14708+  matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_);
14709+  matmul_param_->act_type_ = conv_param_->act_type_;
14710+  return RET_OK;
14711+}
14712+
14713+int Convolution1x1DynamicFP16Coder::InitWeightBias(CoderContext *const context) {
14714+  auto input_channel = filter_tensor_->Channel();
14715+  auto output_channel = filter_tensor_->Batch();
14716+  MS_CHECK_TRUE_RET(input_channel > 0 && output_channel > 0, RET_ERROR);
14717+  pack_weight_size_ = input_channel * UP_ROUND(output_channel, col_tile_) * DataTypeSize(data_type_);
14718+  packed_weight_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
14719+  MS_CHECK_PTR(packed_weight_);
14720+
14721+  NNaclFp32Serializer init_code;
14722+  std::string ori_weight_addr = allocator_->GetRuntimeAddr(filter_tensor_);
14723+  size_t w_buf_size = 0;
14724+  w_buf_size += pack_weight_size_;
14725+  auto packed_weight_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(packed_weight_));
14726+  init_code.CodeBufferOffsetExpression(packed_weight_, context->weight_name(), context->weight_offset_name(),
14727+                                       context->weight_size_name(), pack_weight_size_);
14728+  if (target_ == kARM64 && output_tensor_->format() != NC4HW4) {
14729+    init_code.CodeFunction("RowMajor2Col16MajorFp16Opt", ori_weight_addr, packed_weight_str, output_channel,
14730+                           input_channel);
14731+  } else {
14732+    init_code.CodeFunction("ColMajor2Row8MajorFp16", ori_weight_addr, packed_weight_str, input_channel, output_channel,
14733+                           true);
14734+  }
14735+  bias_data_size_ = UP_ROUND(output_channel, col_tile_) * DataTypeSize(data_type_);
14736+  if (input_tensors_.size() == kInputSize2) {
14737+    bias_data_ =
14738+      allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, bias_tensor_->tensor_name() + "_online_pack");
14739+    MS_CHECK_PTR(bias_data_);
14740+    init_code.CodeBufferOffsetExpression(bias_data_, context->weight_name(), context->weight_offset_name(),
14741+                                         context->weight_size_name(), bias_data_size_);
14742+    w_buf_size += bias_data_size_;
14743+    auto bias_data_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(bias_data_));
14744+    std::string bias_tensor_str = allocator_->GetRuntimeAddr(bias_tensor_);
14745+    init_code.CodeFunction("memcpy", bias_data_str, bias_tensor_str, bias_tensor_->Size());
14746+  } else {
14747+    bias_data_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, node_->name_ + "_bias_online_pack");
14748+    MS_CHECK_PTR(bias_data_);
14749+    init_code.CodeFunction("memset", bias_data_, 0, bias_data_size_);
14750+  }
14751+  context->AppendInitWeightSizeCode(w_buf_size);
14752+  context->AppendInitCode(init_code.str());
14753+  return RET_OK;
14754+}
14755+
14756+int Convolution1x1DynamicFP16Coder::InitTmpBuffer(CoderContext *const context) {
14757+  NNaclFp32Serializer init_code;
14758+  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
14759+                      conv_param_->stride_w_ != 1);
14760+  size_t w_size = 0;
14761+  if (pre_trans_input_) {
14762+    tmp_input_size_ = matmul_param_->row_ * matmul_param_->deep_ * DataTypeSize(data_type_);
14763+    tmp_input_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
14764+    MS_CHECK_PTR(tmp_input_);
14765+    w_size += tmp_input_size_;
14766+    auto tmp_input_str = allocator_->GetRuntimeAddr(static_cast<float16 *>(tmp_input_));
14767+    init_code.CodeBufferOffsetExpression(tmp_input_, context->weight_name(), context->weight_offset_name(),
14768+                                         context->weight_size_name(), tmp_input_size_);
14769+    init_code.CodeFunction("memset", tmp_input_, 0, tmp_input_size_);
14770+  }
14771+  context->AppendInitWeightSizeCode(w_size);
14772+  context->AppendInitCode(init_code.str());
14773+  return RET_OK;
14774+}
14775+
14776+void Convolution1x1DynamicFP16Coder::CollectFilesForFunc(CoderContext *const context) {
14777+  if (target_ == kARM64) {
14778+    Collect(context, {}, {},
14779+            {
14780+              "MatmulFp16.S",
14781+              "MatmulFp16Opt.S",
14782+              "Matmul12X16Fp16.S",
14783+            });
14784+  } else {
14785+    Collect(context, {}, {},
14786+            {
14787+              "Matmul12x8Fp16.S",
14788+            });
14789+  }
14790+  Collect(context,
14791+          {
14792+            "nnacl/fp16/matmul_fp16.h",
14793+            "nnacl/conv_parameter.h",
14794+            "nnacl/op_base.h",
14795+            "nnacl/fp16/conv_fp16.h",
14796+            "nnacl/base/conv1x1_base.h",
14797+          },
14798+          {
14799+            "common_func.c",
14800+            "matmul_fp16.c",
14801+            "conv_fp16.c",
14802+            "conv1x1_base.c",
14803+          });
14804+}
14805+}  // namespace mindspore::lite::micro::nnacl
14806diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h
14807new file mode 100644
14808index 00000000..558eea53
14809--- /dev/null
14810+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h
14811@@ -0,0 +1,68 @@
14812+/**
14813+ * Copyright 2023 Huawei Technologies Co., Ltd
14814+ *
14815+ * Licensed under the Apache License, Version 2.0 (the "License");
14816+ * you may not use this file except in compliance with the License.
14817+ * You may obtain a copy of the License at
14818+ *
14819+ * http://www.apache.org/licenses/LICENSE-2.0
14820+ *
14821+ * Unless required by applicable law or agreed to in writing, software
14822+ * distributed under the License is distributed on an "AS IS" BASIS,
14823+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14824+ * See the License for the specific language governing permissions and
14825+ * limitations under the License.
14826+ */
14827+
14828+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_1X1_DYNAMIC_FP16_CODER_H_
14829+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_1X1_DYNAMIC_FP16_CODER_H_
14830+
14831+#include <vector>
14832+#include <string>
14833+#include "nnacl/conv_parameter.h"
14834+#include "nnacl/matmul_parameter.h"
14835+#include "coder/opcoders/op_coder.h"
14836+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
14837+#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h"
14838+#include "base/float16.h"
14839+
14840+namespace mindspore::lite::micro::nnacl {
14841+class Convolution1x1DynamicFP16Coder final : public OperatorCoder {
14842+ public:
14843+  Convolution1x1DynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
14844+                                 const LiteGraph::Node *node, size_t node_index, Target target)
14845+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
14846+  ~Convolution1x1DynamicFP16Coder() override;
14847+
14848+  int Prepare(CoderContext *const context) override;
14849+
14850+  int DoCode(CoderContext *const context) override;
14851+
14852+ private:
14853+  void CollectFilesForFunc(CoderContext *const context);
14854+  int InitWeightBias(CoderContext *const context);
14855+  int InitMatmulParam();
14856+  int InitTmpBuffer(CoderContext *const context);
14857+  void FreeTmpBuffer();
14858+  int ComputeWorkspace();
14859+  MatMulParameter *matmul_param_{nullptr};
14860+  ConvParameter *conv_param_{nullptr};
14861+  ConvDynamicParameter dynamic_param_;
14862+  Tensor *filter_tensor_{nullptr};
14863+  Tensor *bias_tensor_{nullptr};
14864+  int row_tile_{C12NUM};
14865+  int col_tile_{C8NUM};
14866+  void *packed_weight_{nullptr};
14867+  void *bias_data_{nullptr};
14868+  std::string pack_input_str_;
14869+  void *tmp_input_{nullptr};
14870+  size_t pack_weight_size_{0};
14871+  size_t bias_data_size_{0};
14872+  size_t tmp_input_size_{0};
14873+  size_t pack_input_size_{0};
14874+  bool pre_trans_input_{false};
14875+  std::string output_ptr_;
14876+  TypeId data_type_ = kNumberTypeFloat16;
14877+};
14878+}  // namespace mindspore::lite::micro::nnacl
14879+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_1X1_DYNAMIC_FP16_CODER_H_
14880diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc
14881new file mode 100644
14882index 00000000..c917b89a
14883--- /dev/null
14884+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc
14885@@ -0,0 +1,172 @@
14886+/**
14887+ * Copyright 2023 Huawei Technologies Co., Ltd
14888+ *
14889+ * Licensed under the Apache License, Version 2.0 (the "License");
14890+ * you may not use this file except in compliance with the License.
14891+ * You may obtain a copy of the License at
14892+ *
14893+ * http://www.apache.org/licenses/LICENSE-2.0
14894+ *
14895+ * Unless required by applicable law or agreed to in writing, software
14896+ * distributed under the License is distributed on an "AS IS" BASIS,
14897+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14898+ * See the License for the specific language governing permissions and
14899+ * limitations under the License.
14900+ */
14901+
14902+#include "coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h"
14903+#include <string>
14904+#include <vector>
14905+#include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h"
14906+#include "nnacl/fp32/winograd_utils.h"
14907+#include "coder/opcoders/file_collector.h"
14908+#include "coder/log.h"
14909+#include "coder/opcoders/parallel.h"
14910+#include "coder/utils/coder_utils.h"
14911+#include "base/float16.h"
14912+
14913+using mindspore::schema::PrimitiveType_Conv2DFusion;
14914+namespace mindspore::lite::micro::nnacl {
14915+int ConvolutionDynamicFP16Coder::Prepare(CoderContext *const context) {
14916+  CHECK_LESS_RETURN(input_tensors_.size(), C2NUM);
14917+  CHECK_LESS_RETURN(output_tensors_.size(), 1);
14918+  if (target_ == kARM64) {
14919+    row_tile_ = C16NUM;
14920+  }
14921+  conv_param_ = reinterpret_cast<ConvParameter *>(parameter_);
14922+  MS_CHECK_PTR(conv_param_);
14923+  dynamic_param_.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_)[0];
14924+  conv_param_->input_h_ = input_tensor_->Height();
14925+  conv_param_->input_w_ = input_tensor_->Width();
14926+  conv_param_->input_channel_ = input_tensor_->Channel();
14927+  dynamic_param_.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_)[0];
14928+  conv_param_->output_h_ = output_tensor_->Height();
14929+  conv_param_->output_w_ = output_tensor_->Width();
14930+  conv_param_->output_channel_ = output_tensor_->Channel();
14931+  conv_param_->thread_num_ = 1;
14932+  MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed.");
14933+  MS_CHECK_RET_CODE(InitTmpBuffer(), "Init tmp buffer failed.");
14934+  return RET_OK;
14935+}
14936+
14937+int ConvolutionDynamicFP16Coder::InitTmpBuffer() {
14938+  int uint_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * row_tile_ *
14939+                  conv_param_->thread_num_;
14940+  packed_input_size_ = uint_size * DataTypeSize(data_type_);
14941+  auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_);
14942+  size_t scene_num = 0;
14943+  for (auto &dim_template : input_shape) {
14944+    auto dim_nums = shape_info_container_->GetRealNums(dim_template);
14945+    MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0.");
14946+    scene_num = std::max(scene_num, dim_nums.size());
14947+  }
14948+  for (size_t i = 0; i < scene_num; ++i) {
14949+    packed_input_str_ = dynamic_mem_manager_->AllocWorkSpace(packed_input_size_ * 2, i);
14950+    MS_CHECK_TRUE_MSG(!packed_input_str_.empty(), RET_ERROR, "Convolution cannot alloc workspace.");
14951+  }
14952+  col_major_input_str_ = packed_input_str_ + " + " + std::to_string(packed_input_size_);
14953+  return RET_OK;
14954+}
14955+
14956+int ConvolutionDynamicFP16Coder::InitWeightBias(CoderContext *const context) {
14957+  filter_tensor_ = input_tensors_.at(kWeightIndex);
14958+  CHECK_NULL_RETURN(filter_tensor_);
14959+  auto shape = filter_tensor_->shape();
14960+  if (std::find(shape.begin(), shape.end(), -1) != shape.end()) {
14961+    MS_LOG(WARNING) << "The shape of weight tensor is not ready, the weight and bias would be inited in runtime.";
14962+    return RET_OK;
14963+  }
14964+  int in_channel = filter_tensor_->Channel();
14965+  int out_channel = filter_tensor_->Batch();
14966+  MS_CHECK_TRUE_RET(in_channel > 0 && out_channel > 0, RET_ERROR);
14967+  conv_param_->input_channel_ = in_channel;
14968+  conv_param_->output_channel_ = out_channel;
14969+  int oc8 = UP_ROUND(out_channel, col_tile_);
14970+  int kernel_plane = filter_tensor_->Height() * filter_tensor_->Width();
14971+  pack_weight_size_ = oc8 * in_channel * kernel_plane * DataTypeSize(data_type_);
14972+  // init weight
14973+  packed_weight_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
14974+  MS_CHECK_PTR(packed_weight_);
14975+  NNaclFp32Serializer init_code;
14976+  std::string ori_weight_addr = allocator_->GetRuntimeAddr(filter_tensor_);
14977+  size_t w_buf_size = 0;
14978+  w_buf_size += pack_weight_size_;
14979+  auto packed_weight_str = allocator_->GetRuntimeAddr(static_cast<float16 *>(packed_weight_));
14980+  init_code.CodeBufferOffsetExpression(packed_weight_, context->weight_name(), context->weight_offset_name(),
14981+                                       context->weight_size_name(), pack_weight_size_);
14982+  init_code.CodeFunction("RowMajor2Col8MajorFp16", ori_weight_addr, packed_weight_str, out_channel,
14983+                         in_channel * kernel_plane, false);
14984+  if (input_tensors_.size() == C3NUM) {
14985+    bias_tensor_ = input_tensors_.at(kBiasIndex);
14986+    MS_CHECK_PTR(bias_tensor_);
14987+    bias_data_ =
14988+      allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, bias_tensor_->tensor_name() + "_online_pack");
14989+    MS_CHECK_PTR(bias_data_);
14990+  } else {
14991+    bias_data_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, node_->name_ + "_bias_online_pack");
14992+    MS_CHECK_PTR(bias_data_);
14993+  }
14994+  auto bias_data_size = static_cast<size_t>(oc8 * DataTypeSize(data_type_));
14995+  w_buf_size += bias_data_size;
14996+  init_code.CodeBufferOffsetExpression(bias_data_, context->weight_name(), context->weight_offset_name(),
14997+                                       context->weight_size_name(), bias_data_size);
14998+  bias_data_str_ = allocator_->GetRuntimeAddr(bias_data_);
14999+  if (input_tensors_.size() == C3NUM) {
15000+    auto origin_bias_str = allocator_->GetRuntimeAddr(bias_tensor_);
15001+    init_code.CodeFunction("memcpy", bias_data_str_, origin_bias_str, bias_tensor_->Size());
15002+  } else {
15003+    init_code.CodeFunction("memset", bias_data_str_, 0, bias_data_size);
15004+  }
15005+  context->AppendInitWeightSizeCode(w_buf_size);
15006+  context->AppendInitCode(init_code.str());
15007+  return RET_OK;
15008+}
15009+
15010+void ConvolutionDynamicFP16Coder::CollectFilesForFunc(CoderContext *const context) {
15011+  Collect(context, {}, {},
15012+          {
15013+            "MatmulFp16.S",
15014+            "MatmulFp16Opt.S",
15015+            "MatVecMulFp16.S",
15016+            "Matmul12X16Fp16.S",
15017+          });
15018+  Collect(context,
15019+          {
15020+            "nnacl/fp16/matmul_fp16.h",
15021+            "nnacl/conv_parameter.h",
15022+            "nnacl/op_base.h",
15023+            "nnacl/fp16/conv_fp16.h",
15024+          },
15025+          {
15026+            "common_func.c",
15027+            "matmul_fp16.c",
15028+            "pack_fp16.c",
15029+            "conv_fp16.c",
15030+          });
15031+}
15032+
15033+int ConvolutionDynamicFP16Coder::DoCode(CoderContext *const context) {
15034+  CollectFilesForFunc(context);
15035+  NNaclFp32Serializer code;
15036+  // call the op function
15037+  auto packed_weight_str = allocator_->GetRuntimeAddr(static_cast<float16 *>(packed_weight_));
15038+  auto input_str =
15039+    "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
15040+  auto output_str =
15041+    "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
15042+  //  code.CodeFunction("memset", packed_input_str_, "0", packed_input_size_);
15043+  //  code.CodeFunction("memset", col_major_input_str_, "0", packed_input_size_);
15044+  code.CodeStruct("conv_parameter", *conv_param_, dynamic_param_);
15045+  packed_input_str_ = "(float16_t *)(" + packed_input_str_ + ")";
15046+  col_major_input_str_ = "(float16_t *)(" + col_major_input_str_ + ")";
15047+  if (output_tensor_->format() == NC4HW4) {
15048+    code.CodeFunction("ConvOutNc8hw8Fp16", input_str, packed_input_str_, packed_weight_str, bias_data_str_,
15049+                      col_major_input_str_, output_str, kDefaultTaskId, "&conv_parameter");
15050+  } else {
15051+    code.CodeFunction("ConvFp16", input_str, packed_input_str_, packed_weight_str, bias_data_str_, col_major_input_str_,
15052+                      output_str, kDefaultTaskId, "&conv_parameter");
15053+  }
15054+  context->AppendCode(code.str());
15055+  return RET_OK;
15056+}
15057+}  // namespace mindspore::lite::micro::nnacl
15058diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h
15059new file mode 100644
15060index 00000000..29d70796
15061--- /dev/null
15062+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h
15063@@ -0,0 +1,59 @@
15064+/**
15065+ * Copyright 2023 Huawei Technologies Co., Ltd
15066+ *
15067+ * Licensed under the Apache License, Version 2.0 (the "License");
15068+ * you may not use this file except in compliance with the License.
15069+ * You may obtain a copy of the License at
15070+ *
15071+ * http://www.apache.org/licenses/LICENSE-2.0
15072+ *
15073+ * Unless required by applicable law or agreed to in writing, software
15074+ * distributed under the License is distributed on an "AS IS" BASIS,
15075+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15076+ * See the License for the specific language governing permissions and
15077+ * limitations under the License.
15078+ */
15079+
15080+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_DYNAMIC_FP16_CODER_H_
15081+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_DYNAMIC_FP16_CODER_H_
15082+
15083+#include <vector>
15084+#include <string>
15085+#include "nnacl/conv_parameter.h"
15086+#include "coder/opcoders/op_coder.h"
15087+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
15088+#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h"
15089+
15090+namespace mindspore::lite::micro::nnacl {
15091+class ConvolutionDynamicFP16Coder final : public OperatorCoder {
15092+ public:
15093+  ConvolutionDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
15094+                              const LiteGraph::Node *node, size_t node_index, Target target)
15095+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
15096+
15097+  ~ConvolutionDynamicFP16Coder() override = default;
15098+
15099+  int Prepare(CoderContext *const context) override;
15100+  int DoCode(CoderContext *const context) override;
15101+
15102+ private:
15103+  void CollectFilesForFunc(CoderContext *const context);
15104+  int InitWeightBias(CoderContext *const context);
15105+  int InitTmpBuffer();
15106+  ConvParameter *conv_param_{nullptr};
15107+  ConvDynamicParameter dynamic_param_;
15108+  TypeId data_type_{kNumberTypeFloat16};
15109+  int row_tile_{C12NUM};
15110+  int col_tile_{C8NUM};
15111+  Tensor *filter_tensor_{nullptr};
15112+  Tensor *bias_tensor_{nullptr};
15113+  size_t pack_weight_size_{0};
15114+  size_t packed_input_size_{0};
15115+  void *packed_weight_{nullptr};
15116+  void *bias_data_{nullptr};
15117+  std::string packed_input_str_;
15118+  std::string col_major_input_str_;
15119+  std::string bias_data_str_;
15120+};
15121+}  // namespace mindspore::lite::micro::nnacl
15122+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_DYNAMIC_FP16_CODER_H_
15123diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc
15124new file mode 100644
15125index 00000000..8c4cc31b
15126--- /dev/null
15127+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc
15128@@ -0,0 +1,366 @@
15129+/**
15130+ * Copyright 2023 Huawei Technologies Co., Ltd
15131+ *
15132+ * Licensed under the Apache License, Version 2.0 (the "License");
15133+ * you may not use this file except in compliance with the License.
15134+ * You may obtain a copy of the License at
15135+ *
15136+ * http://www.apache.org/licenses/LICENSE-2.0
15137+ *
15138+ * Unless required by applicable law or agreed to in writing, software
15139+ * distributed under the License is distributed on an "AS IS" BASIS,
15140+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15141+ * See the License for the specific language governing permissions and
15142+ * limitations under the License.
15143+ */
15144+
15145+#include "coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h"
15146+#include <cfloat>
15147+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
15148+#include "coder/opcoders/file_collector.h"
15149+#include "coder/utils/coder_utils.h"
15150+#include "tools/common/string_util.h"
15151+
15152+using mindspore::schema::PrimitiveType_LSTM;
15153+
15154+namespace mindspore::lite::micro::nnacl {
15155+namespace {
15156+constexpr size_t kMindirInputTensorNum = 4;
15157+}  // namespace
15158+
15159+int LstmMindirDynamicFP16Coder::Prepare(CoderContext *const context) {
15160+  CHECK_NULL_RETURN(context);
15161+  CHECK_NOT_EQUAL_RETURN(input_tensors_.size(), kMindirInputTensorNum);
15162+  for (auto in : input_tensors_) {
15163+    MS_CHECK_TRUE_MSG(in != nullptr, RET_INPUT_TENSOR_ERROR, "LstmMindirDynamicFP16Coder input is a nullptr.");
15164+    MS_CHECK_TRUE_MSG(in->data_type() == kNumberTypeFloat16, RET_INPUT_TENSOR_ERROR,
15165+                      "LstmMindirDynamicFP16Coder input must be fp16.");
15166+    MS_CHECK_TRUE_MSG(in->shape().size() == C3NUM, RET_INPUT_TENSOR_ERROR,
15167+                      "LstmMindirDynamicFP16Coder input must be 3D.");
15168+  }
15169+  MS_CHECK_TRUE_MSG(input_tensors_[FOURTH_INPUT]->IsConst(), RET_INPUT_TENSOR_ERROR,
15170+                    "LstmMindirDynamicFP16Coder fourth input (combined weight) must be constant.");
15171+  lstm_param_ = reinterpret_cast<LstmParameter *>(parameter_);
15172+  return InitParam();
15173+}
15174+
15175+int LstmMindirDynamicFP16Coder::DoCode(CoderContext *const context) {
15176+  Collect(context,
15177+          {
15178+            "nnacl/lstm_parameter.h",
15179+            "nnacl/fp16/lstm_fp16.h",
15180+          },
15181+          {"lstm_fp16.c", "activation_fp16.c", "arithmetic_fp16.c", "matmul_fp16.c", "pack_fp16.c"},
15182+          {"MatmulBaseFp16Neon.S"});
15183+
15184+  auto ret = InitInputWeightBias(context);
15185+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm InitInputWeightBias failed.");
15186+  ret = InitStateWeightBias(context);
15187+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm InitStateWeightBias failed.");
15188+  ret = InitProjectWeight(context);
15189+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm InitProjectWeight failed.");
15190+  ret = ComputeWorkSpace();
15191+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm ComputeWorkSpace failed.");
15192+  CreateBufferAddrStr();
15193+  NNaclFp32Serializer code;
15194+  code << "float16_t *buffer[7] = {";
15195+  for (const auto &buf : buffers_str_) {
15196+    code << "(float16_t *)(" << buf << "), ";
15197+  }
15198+  code << "};\n";
15199+
15200+  auto input1 = dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[FIRST_INPUT]);
15201+  auto hidden_init = input_tensors_[SECOND_INPUT]->IsConst()
15202+                       ? allocator_->GetRuntimeAddr(input_tensors_[SECOND_INPUT], true)
15203+                       : dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[SECOND_INPUT]);
15204+  auto cell_init = input_tensors_[THIRD_INPUT]->IsConst()
15205+                     ? allocator_->GetRuntimeAddr(input_tensors_[THIRD_INPUT], true)
15206+                     : dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[THIRD_INPUT]);
15207+  auto output1 = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[FIRST_INPUT]);
15208+  auto hidden_output = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[SECOND_INPUT]);
15209+  auto cell_output = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[THIRD_INPUT]);
15210+  MS_CHECK_TRUE_MSG(!input1.empty() && !hidden_init.empty() && !cell_init.empty() && !output1.empty() &&
15211+                      !hidden_output.empty() && !cell_output.empty(),
15212+                    RET_ERROR, "Lstm cannot get addr.");
15213+  code.CodeStruct("lstm_param", *lstm_param_, dynamic_lstm_param_);
15214+  auto input_shape2 = shape_info_container_->GetTemplateShape(input_tensors_[SECOND_INPUT]);
15215+  int64_t const_part = 1;
15216+  std::string non_const_part;
15217+  for (const auto &item : input_shape2) {
15218+    if (IsNumber(item)) {
15219+      const_part *= std::stoi(item);
15220+    } else {
15221+      if (!non_const_part.empty()) {
15222+        non_const_part += " * ";
15223+      }
15224+      non_const_part += item;
15225+    }
15226+  }
15227+  code.CodeFunction("memcpy", hidden_output, hidden_init,
15228+                    non_const_part + " * " + std::to_string(const_part * DataTypeSize(kNumberTypeFloat16)));
15229+  auto input_shape3 = shape_info_container_->GetTemplateShape(input_tensors_[THIRD_INPUT]);
15230+  const_part = 1;
15231+  non_const_part = "";
15232+  for (const auto &item : input_shape3) {
15233+    if (IsNumber(item)) {
15234+      const_part *= std::stoi(item);
15235+    } else {
15236+      if (!non_const_part.empty()) {
15237+        non_const_part += " * ";
15238+      }
15239+      non_const_part += item;
15240+    }
15241+  }
15242+  code.CodeFunction("memcpy", cell_output, cell_init,
15243+                    non_const_part + " * " + std::to_string(const_part * DataTypeSize(kNumberTypeFloat16)));
15244+  auto weight_i_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(weight_i_ptr_));
15245+  auto weight_h_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(weight_h_ptr_));
15246+  auto weight_pro_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(weight_project_ptr_));
15247+  auto input_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(input_bias_));
15248+  auto state_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(hh_bias_));
15249+  auto pro_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast<float16 *>(project_bias_));
15250+
15251+  code.CodeFunction("LstmFp16", "(float16_t *)(" + output1 + ")", "(float16_t *)(" + input1 + ")", weight_i_str,
15252+                    weight_h_str, input_bias_str, state_bias_str, weight_pro_str, pro_bias_str,
15253+                    "(float16_t *)(" + hidden_output + ")", "(float16_t *)(" + cell_output + ")", "buffer",
15254+                    "&lstm_param");
15255+  context->AppendCode(code.str());
15256+  return RET_OK;
15257+}
15258+
15259+int LstmMindirDynamicFP16Coder::InitParam() {
15260+  auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[FIRST_INPUT]);
15261+  MS_CHECK_TRUE_MSG(in_shape1.size() == C3NUM, RET_INPUT_TENSOR_ERROR, "LstmMindir first input's dim must be 3D.");
15262+  dynamic_lstm_param_.batch_ = in_shape1[1];
15263+  dynamic_lstm_param_.seq_len_ = in_shape1[0];
15264+  MS_CHECK_TRUE_MSG(IsNumber(in_shape1[C2NUM]), RET_NOT_SUPPORT,
15265+                    "LstmMindir doesn't support input_size is dynamical in micro.");
15266+  lstm_param_->input_size_ = std::atoi(in_shape1[C2NUM].c_str());
15267+
15268+  auto h_init_shape = input_tensors_[SECOND_INPUT]->shape();
15269+  auto c_init_shape = input_tensors_[THIRD_INPUT]->shape();
15270+  lstm_param_->hidden_size_ = c_init_shape.back();
15271+  lstm_param_->output_size_ = h_init_shape.back();
15272+
15273+  lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->output_size_
15274+                                                          : lstm_param_->batch_ * lstm_param_->output_size_;
15275+  weight_segment_num_ = lstm_param_->bidirectional_ ? C8NUM : C4NUM;
15276+  dynamic_lstm_param_.input_row_align_ =
15277+    "(" + dynamic_lstm_param_.batch_ + " * " + dynamic_lstm_param_.seq_len_ + " + 3) / 4 * 4";
15278+  lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM);
15279+
15280+  dynamic_lstm_param_.state_row_align_ = "(" + dynamic_lstm_param_.batch_ + " + 3) / 4 * 4";
15281+  lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM);
15282+  lstm_param_->proj_col_align_ = UP_ROUND(lstm_param_->project_size_, C4NUM);
15283+  dynamic_lstm_param_.output_step_ =
15284+    std::to_string((lstm_param_->bidirectional_ ? C2NUM : C1NUM) * lstm_param_->output_size_) + " * " +
15285+    dynamic_lstm_param_.batch_;
15286+  size_t scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
15287+  hi_size_ = scale * C4NUM * lstm_param_->hidden_size_ * lstm_param_->input_size_;
15288+  hh_size_ = scale * C4NUM * lstm_param_->hidden_size_ * lstm_param_->output_size_;
15289+  hp_size_ = scale * lstm_param_->project_size_ * lstm_param_->hidden_size_;
15290+  bias_size_ = scale * C8NUM * lstm_param_->hidden_size_;
15291+  auto real_whole_size = input_tensors_[FOURTH_INPUT]->ElementsNum();
15292+  gpu_state_ = (hi_size_ + hh_size_ + hp_size_ + bias_size_) == static_cast<size_t>(real_whole_size);
15293+  if (gpu_state_) {
15294+    MS_LOG(ERROR) << "LstmMindirDynamicFP16Coder doesn't support models exported from GPU.";
15295+    return RET_NOT_SUPPORT;
15296+  }
15297+  if (hi_size_ + hh_size_ + hp_size_ == static_cast<size_t>(real_whole_size)) {
15298+    bias_size_ = 0;
15299+    return RET_OK;
15300+  }
15301+  bias_size_ /= C2NUM;
15302+  if ((hi_size_ + hh_size_ + hp_size_ + bias_size_) != static_cast<size_t>(real_whole_size)) {
15303+    MS_LOG(ERROR) << "Bias of LstmMindir exported from cpu only exists in hi-part.";
15304+    return RET_INPUT_TENSOR_ERROR;
15305+  }
15306+  return RET_OK;
15307+}
15308+
15309+int LstmMindirDynamicFP16Coder::InitInputWeightBias(CoderContext *const context) {
15310+  NNaclFp32Serializer init_code;
15311+
15312+  size_t weight_hi_size =
15313+    weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * DataTypeSize(data_type_);
15314+  weight_i_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
15315+  MS_CHECK_PTR(weight_i_ptr_);
15316+
15317+  size_t w_buf_size = 0;
15318+
15319+  init_code.CodeBufferOffsetExpression(weight_i_ptr_, context->weight_name(), context->weight_offset_name(),
15320+                                       context->weight_size_name(), weight_hi_size);
15321+  w_buf_size += weight_hi_size;
15322+  auto weight_i_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FOURTH_INPUT]);
15323+  MS_CHECK_TRUE_MSG(!weight_i_str.empty(), RET_INPUT_TENSOR_ERROR, "Lstm cannot get weight.");
15324+  auto packed_weight_i_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast<float16 *>(weight_i_ptr_));
15325+  init_code << "  int32_t order[4] = {0, 2, 3, 1};\n";
15326+  init_code.CodeFunction("PackLstmWeightFp16", packed_weight_i_str, weight_i_str, weight_segment_num_,
15327+                         lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, "order");
15328+
15329+  auto bias_stride = hi_size_ + hh_size_ + hp_size_;
15330+  input_bias_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
15331+  MS_CHECK_PTR(input_bias_);
15332+  size_t bias_i_size = weight_segment_num_ * lstm_param_->input_col_align_ * DataTypeSize(data_type_);
15333+  w_buf_size += bias_i_size;
15334+  init_code.CodeBufferOffsetExpression(input_bias_, context->weight_name(), context->weight_offset_name(),
15335+                                       context->weight_size_name(), bias_i_size);
15336+  auto input_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast<float16 *>(input_bias_));
15337+  init_code.CodeFunction("memset", input_bias_str, 0, bias_i_size);
15338+  if (bias_size_ != 0) {
15339+    init_code.CodeFunction("PackLstmBiasFp16", input_bias_str, weight_i_str + " + " + std::to_string(bias_stride),
15340+                           weight_segment_num_, lstm_param_->hidden_size_, lstm_param_->input_col_align_,
15341+                           lstm_param_->bidirectional_, "order");
15342+  }
15343+
15344+  context->AppendInitWeightSizeCode(w_buf_size);
15345+  context->AppendInitCode(init_code.str());
15346+  return RET_OK;
15347+}
15348+
15349+int LstmMindirDynamicFP16Coder::InitStateWeightBias(CoderContext *const context) {
15350+  NNaclFp32Serializer init_code;
15351+
15352+  size_t weight_hh_size =
15353+    weight_segment_num_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * DataTypeSize(data_type_);
15354+  weight_h_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
15355+  MS_CHECK_PTR(weight_h_ptr_);
15356+
15357+  size_t w_buf_size = 0;
15358+
15359+  init_code.CodeBufferOffsetExpression(weight_h_ptr_, context->weight_name(), context->weight_offset_name(),
15360+                                       context->weight_size_name(), weight_hh_size);
15361+  w_buf_size += weight_hh_size;
15362+  auto weight_hh_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FOURTH_INPUT]);
15363+  MS_CHECK_TRUE_MSG(!weight_hh_str.empty(), RET_INPUT_TENSOR_ERROR, "Lstm cannot get weight.");
15364+  auto packed_weight_hh_str =
15365+    MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast<float16 *>(weight_h_ptr_));
15366+  init_code << "  int32_t order[4] = {0, 2, 3, 1};\n";
15367+  init_code.CodeFunction("PackLstmWeightFp16", packed_weight_hh_str, weight_hh_str + " + " + std::to_string(hi_size_),
15368+                         weight_segment_num_, lstm_param_->project_size_, lstm_param_->hidden_size_,
15369+                         lstm_param_->state_col_align_, "order");
15370+
15371+  hh_bias_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
15372+  MS_CHECK_PTR(hh_bias_);
15373+  size_t bias_hh_size = weight_segment_num_ * lstm_param_->state_col_align_ * DataTypeSize(data_type_);
15374+  w_buf_size += bias_hh_size;
15375+  init_code.CodeBufferOffsetExpression(hh_bias_, context->weight_name(), context->weight_offset_name(),
15376+                                       context->weight_size_name(), bias_hh_size);
15377+  auto hh_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast<float16 *>(hh_bias_));
15378+  init_code.CodeFunction("memset", hh_bias_str, 0, bias_hh_size);
15379+
15380+  context->AppendInitWeightSizeCode(w_buf_size);
15381+  context->AppendInitCode(init_code.str());
15382+  return RET_OK;
15383+}
15384+
15385+int LstmMindirDynamicFP16Coder::InitProjectWeight(CoderContext *const context) {
15386+  if (hp_size_ == 0) {
15387+    return RET_OK;
15388+  }
15389+
15390+  NNaclFp32Serializer init_code;
15391+  size_t w_buf_size = 0;
15392+  int scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
15393+  int col_align = UP_ROUND(lstm_param_->project_size_, C8NUM);
15394+  size_t weight_pro_size = scale * lstm_param_->hidden_size_ * col_align * DataTypeSize(data_type_);
15395+  weight_project_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
15396+  MS_CHECK_PTR(weight_project_ptr_);
15397+  init_code.CodeBufferOffsetExpression(weight_project_ptr_, context->weight_name(), context->weight_offset_name(),
15398+                                       context->weight_size_name(), weight_pro_size);
15399+  w_buf_size += weight_pro_size;
15400+  auto weight_hp_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FOURTH_INPUT]);
15401+  MS_CHECK_TRUE_MSG(!weight_hp_str.empty(), RET_INPUT_TENSOR_ERROR, "Lstm cannot get weight.");
15402+  auto weight_pro_str =
15403+    MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast<float16 *>(weight_project_ptr_));
15404+  init_code.CodeFunction("PackLstmWeightFp16", weight_pro_str,
15405+                         weight_hp_str + " + " + std::to_string(hi_size_ + hh_size_), scale, lstm_param_->hidden_size_,
15406+                         lstm_param_->project_size_, col_align, "NULL");
15407+
15408+  size_t bias_pro_size = col_align * DataTypeSize(data_type_);
15409+  project_bias_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
15410+  MS_CHECK_PTR(project_bias_);
15411+  init_code.CodeBufferOffsetExpression(project_bias_, context->weight_name(), context->weight_offset_name(),
15412+                                       context->weight_size_name(), bias_pro_size);
15413+  w_buf_size += bias_pro_size;
15414+  auto bias_pro_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast<float16 *>(project_bias_));
15415+  init_code.CodeFunction("memset", bias_pro_str, 0, bias_pro_size);
15416+
15417+  context->AppendInitWeightSizeCode(w_buf_size);
15418+  context->AppendInitCode(init_code.str());
15419+  return RET_OK;
15420+}
15421+
15422+int LstmMindirDynamicFP16Coder::ComputeWorkSpace() {
15423+  auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[FIRST_INPUT]);
15424+  auto seq_lens = shape_info_container_->GetRealNums(in_shape1[0]);
15425+  MS_CHECK_TRUE_MSG(!seq_lens.empty(), RET_ERROR, "Lstm cannot get seq_len");
15426+  auto batches = shape_info_container_->GetRealNums(in_shape1[1]);
15427+  MS_CHECK_TRUE_MSG(!batches.empty(), RET_ERROR, "Lstm cannot get batch");
15428+  size_t scene_num = seq_lens.size() > batches.size() ? seq_lens.size() : batches.size();
15429+  for (size_t i = 0; i < scene_num; ++i) {
15430+    int seq_len = seq_lens[i % seq_lens.size()];
15431+    int batch = batches[i % batches.size()];
15432+    size_t buffer1 =
15433+      seq_len * batch <= C3NUM ? 0 : seq_len * batch * lstm_param_->input_size_ * DataTypeSize(data_type_);
15434+    size_t buffer2 = C4NUM * seq_len * batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_);
15435+    size_t buffer3 = batch <= C3NUM ? 0 : batch * lstm_param_->output_size_ * DataTypeSize(data_type_);
15436+    size_t buffer4 = C4NUM * batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_);
15437+    size_t buffer5 = (lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)
15438+                       ? 0
15439+                       : batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_);
15440+    size_t buffer6 = (lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)
15441+                       ? 0
15442+                       : batch * lstm_param_->output_size_ * DataTypeSize(data_type_);
15443+    size_t buffer7 = (batch <= C3NUM || lstm_param_->project_size_ == 0)
15444+                       ? 0
15445+                       : batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_);
15446+    auto whole_size = buffer1 + buffer2 + buffer3 + buffer4 + buffer5 + buffer6 + buffer7;
15447+    buffers_start_ = dynamic_mem_manager_->AllocWorkSpace(whole_size, i);
15448+    MS_CHECK_TRUE_MSG(!buffers_start_.empty(), RET_ERROR, "Lstm cannot alloc workspace.");
15449+  }
15450+
15451+  return RET_OK;
15452+}
15453+
15454+void LstmMindirDynamicFP16Coder::CreateBufferAddrStr() {
15455+  auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[FIRST_INPUT]);
15456+  auto seq_len = in_shape1[0];
15457+  auto batch = in_shape1[1];
15458+  auto input_row_align = "(" + seq_len + " * " + batch + " + 3) / 4 * 4";
15459+  auto state_row_align = "(" + batch + " + 3) / 4 * 4";
15460+  buffers_str_.push_back("(" + seq_len + " * " + batch + " <= 3) ? NULL : " + buffers_start_);
15461+  auto offset = "((" + seq_len + " * " + batch + " <= 3) ? 0 : (" + seq_len + " * " + batch + ") * " +
15462+                std::to_string(lstm_param_->input_size_ * DataTypeSize(data_type_)) + ")";
15463+  buffers_str_.push_back(buffers_start_ + " + " + offset);
15464+  offset = "(" + offset + " + " + seq_len + " * " + batch + " * " +
15465+           std::to_string(C4NUM * lstm_param_->hidden_size_ * DataTypeSize(data_type_)) + ")";
15466+  buffers_str_.push_back(batch + " <= 3 ? NULL : (" + buffers_start_ + " + " + offset + ")");
15467+  offset = "(" + offset + " + (" + batch + " <= 3 ? 0 : (" + batch + ") * " +
15468+           std::to_string(lstm_param_->output_size_ * DataTypeSize(data_type_)) + "))";
15469+  buffers_str_.push_back(buffers_start_ + " + " + offset);
15470+  offset = "(" + offset + " + " + batch + " * " +
15471+           std::to_string(C4NUM * lstm_param_->hidden_size_ * DataTypeSize(data_type_)) + ")";
15472+  if (lstm_param_->zoneout_cell_ < -FLT_EPSILON || lstm_param_->zoneout_cell_ > FLT_EPSILON) {
15473+    buffers_str_.push_back(buffers_start_ + " + " + offset);
15474+    offset =
15475+      "(" + offset + " + " + batch + " * " + std::to_string(lstm_param_->hidden_size_ * DataTypeSize(data_type_)) + ")";
15476+  } else {
15477+    buffers_str_.emplace_back("NULL");
15478+  }
15479+  if (lstm_param_->zoneout_hidden_ < -FLT_EPSILON || lstm_param_->zoneout_hidden_ > FLT_EPSILON) {
15480+    buffers_str_.push_back(buffers_start_ + " + " + offset);
15481+    offset =
15482+      "(" + offset + " + " + batch + " * " + std::to_string(lstm_param_->output_size_ * DataTypeSize(data_type_)) + ")";
15483+  } else {
15484+    buffers_str_.emplace_back("NULL");
15485+  }
15486+  if (lstm_param_->project_size_ == 0) {
15487+    buffers_str_.emplace_back("NULL");
15488+  } else {
15489+    buffers_str_.emplace_back(batch + " <= 3 ? NULL : " + "(" + buffers_start_ + " + " + offset + ")");
15490+  }
15491+}
15492+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LSTM,
15493+                           CPUOpCoderCreator<LstmMindirDynamicFP16Coder>)
15494+}  // namespace mindspore::lite::micro::nnacl
15495diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h
15496new file mode 100644
15497index 00000000..1084fa82
15498--- /dev/null
15499+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h
15500@@ -0,0 +1,66 @@
15501+/**
15502+ * Copyright 2023 Huawei Technologies Co., Ltd
15503+ *
15504+ * Licensed under the Apache License, Version 2.0 (the "License");
15505+ * you may not use this file except in compliance with the License.
15506+ * You may obtain a copy of the License at
15507+ *
15508+ * http://www.apache.org/licenses/LICENSE-2.0
15509+ *
15510+ * Unless required by applicable law or agreed to in writing, software
15511+ * distributed under the License is distributed on an "AS IS" BASIS,
15512+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15513+ * See the License for the specific language governing permissions and
15514+ * limitations under the License.
15515+ */
15516+
15517+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_LSTM_DYNAMIC_FP16_CODER_H
15518+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_LSTM_DYNAMIC_FP16_CODER_H
15519+
15520+#include <vector>
15521+#include <string>
15522+#include "nnacl/lstm_parameter.h"
15523+#include "coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h"
15524+#include "coder/opcoders/op_coder.h"
15525+
15526+namespace mindspore::lite::micro::nnacl {
15527+
15528+class LstmMindirDynamicFP16Coder : public OperatorCoder {
15529+ public:
15530+  LstmMindirDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
15531+                             const LiteGraph::Node *node, size_t node_index, Target target)
15532+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
15533+
15534+  ~LstmMindirDynamicFP16Coder() override = default;
15535+
15536+  int Prepare(CoderContext *const context) override;
15537+  int DoCode(CoderContext *const context) override;
15538+
15539+ private:
15540+  int InitParam();
15541+  int ComputeWorkSpace();
15542+  void CreateBufferAddrStr();
15543+  int InitInputWeightBias(CoderContext *const context);
15544+  int InitStateWeightBias(CoderContext *const context);
15545+  int InitProjectWeight(CoderContext *const context);
15546+  bool gpu_state_{false};
15547+  TypeId data_type_{kNumberTypeFloat16};
15548+  int weight_segment_num_{0};
15549+  size_t hi_size_{0};
15550+  size_t hh_size_{0};
15551+  size_t hp_size_{0};
15552+  size_t bias_size_{0};
15553+  void *weight_i_ptr_{nullptr};
15554+  void *weight_h_ptr_{nullptr};
15555+  void *weight_project_ptr_{nullptr};
15556+  void *input_bias_{nullptr};
15557+  void *hh_bias_{nullptr};
15558+  void *project_bias_{nullptr};
15559+  LstmParameter *lstm_param_{nullptr};
15560+  DynamicLstmParameter dynamic_lstm_param_;
15561+  std::string buffers_start_;
15562+  std::vector<std::string> buffers_str_;
15563+};
15564+}  // namespace mindspore::lite::micro::nnacl
15565+
15566+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_LSTM_DYNAMIC_FP16_CODER_H
15567diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc
15568new file mode 100644
15569index 00000000..f6c56f86
15570--- /dev/null
15571+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc
15572@@ -0,0 +1,228 @@
15573+/**
15574+ * Copyright 2023 Huawei Technologies Co., Ltd
15575+ *
15576+ * Licensed under the Apache License, Version 2.0 (the "License");
15577+ * you may not use this file except in compliance with the License.
15578+ * You may obtain a copy of the License at
15579+ *
15580+ * http://www.apache.org/licenses/LICENSE-2.0
15581+ *
15582+ * Unless required by applicable law or agreed to in writing, software
15583+ * distributed under the License is distributed on an "AS IS" BASIS,
15584+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15585+ * See the License for the specific language governing permissions and
15586+ * limitations under the License.
15587+ */
15588+
15589+#include "tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h"
15590+#include <string>
15591+#include <vector>
15592+#include "tools/converter/micro/coder/log.h"
15593+#include "tools/converter/micro/coder/opcoders/file_collector.h"
15594+#include "base/float16.h"
15595+#include "tools/common/string_util.h"
15596+#include "coder/utils/coder_utils.h"
15597+
15598+using mindspore::schema::PrimitiveType_MatMulFusion;
15599+
15600+namespace mindspore::lite::micro::nnacl {
15601+int MatMulDynamicFP16BaseCoder::Prepare(CoderContext *const context) {
15602+  row_tile_ = C1NUM;
15603+  col_tile_ = C4NUM;
15604+  auto ret = InitAShape();
15605+  MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "init matrixA's info failed");
15606+  ret = InitBShape();
15607+  MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "init matrixB's info failed");
15608+  params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
15609+  return RET_OK;
15610+}
15611+
15612+int MatMulDynamicFP16BaseCoder::DoCode(CoderContext *const context) {
15613+  CollectFilesForTarget(context);
15614+  auto ret = InitMatrixB(context);
15615+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "InitMatrixB failed.");
15616+  ret = InitBiasData(context);
15617+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "InitBiasData failed.");
15618+
15619+  ret = ComputeWorkSpace();
15620+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Matmul alloc workspace failed.");
15621+  auto input_a_str = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
15622+  MS_CHECK_TRUE_MSG(!input_a_str.empty(), RET_ERROR, "Matmul cannot get matrixA");
15623+  auto output_str = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
15624+  MS_CHECK_TRUE_MSG(!output_str.empty(), RET_ERROR, "Matmul cannot get output");
15625+  NNaclFp32Serializer code;
15626+  if (params_->a_transpose_) {
15627+    code << "  if (" << dynamic_params_.row_ << " == 1) {\n";
15628+    code << "    if (" << dynamic_params_.batch_ << " <= 3) {\n";
15629+    code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + input_a_str + ")", input_b_pack_str_,
15630+                      "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
15631+                      dynamic_params_.batch_, params_->col_, params_->col_, OutType_Nhwc);
15632+    code << "    } else {\n";
15633+    code.CodeFunction("RowMajor2ColLadder12MajorFp16", "(float16_t *)(" + input_a_str + ")",
15634+                      "(float16_t *)(" + buffer_start_ + ")", dynamic_params_.batch_, params_->deep_);
15635+    code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + buffer_start_ + ")", input_b_pack_str_,
15636+                      "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
15637+                      dynamic_params_.batch_, params_->col_, params_->col_, OutType_Nhwc);
15638+    code << "    }\n  } else {\n";  // close the inner if/else before opening the outer else branch
15639+    code << "    int in_stride = " << dynamic_params_.row_ << " * " << params_->deep_ << ";\n";
15640+    code << "    int out_stride = " << dynamic_params_.row_ << " * " << params_->col_ << ";\n";
15641+    code << "    for (int i = 0; i < " << dynamic_params_.batch_ << "; ++i) {\n";
15642+    code.CodeFunction("RowMajor2RowLadder12MajorFp16", "(float16_t *)(" + input_a_str + ")" + " + in_stride * i",
15643+                      "(float16_t *)(" + buffer_start_ + ")", params_->deep_, dynamic_params_.row_);
15644+    code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + buffer_start_ + ")", input_b_pack_str_,
15645+                      "(float16_t *)(" + output_str + ")" + " + out_stride * i", bias_str_, params_->act_type_,
15646+                      params_->deep_, dynamic_params_.row_, params_->col_, OutType_Nhwc);
15647+    code << "    }\n";
15648+    // the closing '}' of the outer else is emitted by the shared `code << "  }\n";` after this if/else
15649+  } else {
15650+    code << "  if (" << dynamic_params_.batch_ << " * " << dynamic_params_.row_ << " <= 3) {\n";
15651+    code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + input_a_str + ")", input_b_pack_str_,
15652+                      "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
15653+                      dynamic_params_.batch_ + " * " + dynamic_params_.row_, params_->col_, params_->col_,
15654+                      OutType_Nhwc);
15655+    code << "  } else {\n";
15656+    code.CodeFunction("RowMajor2ColLadder12MajorFp16", "(float16_t *)(" + input_a_str + ")",
15657+                      "(float16_t *)(" + buffer_start_ + ")", dynamic_params_.batch_ + " * " + dynamic_params_.row_,
15658+                      params_->deep_);
15659+    code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + buffer_start_ + ")", input_b_pack_str_,
15660+                      "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
15661+                      dynamic_params_.batch_ + " * " + dynamic_params_.row_, params_->col_, params_->col_,
15662+                      OutType_Nhwc);
15663+  }
15664+  code << "  }\n";
15665+  context->AppendCode(code.str());
15666+  return RET_OK;
15667+}
15668+
15669+int MatMulDynamicFP16BaseCoder::InitMatrixB(CoderContext *const context) {
15670+  NNaclFp32Serializer init_code;
15671+  if (b_pack_ptr_ != nullptr) {
15672+    return RET_OK;
15673+  }
15674+  auto b_pack_ptr_size = static_cast<size_t>(params_->col_align_ * params_->deep_ * DataTypeSize(data_type_));
15675+  b_pack_ptr_ = allocator_->GetSharedWeightAddr(filter_tensor_);
15676+  if (b_pack_ptr_ == nullptr) {
15677+    b_pack_ptr_ = allocator_->Malloc(data_type_, b_pack_ptr_size, kOnlinePackWeight,
15678+                                     filter_tensor_->tensor_name() + "_online_pack");
15679+    allocator_->MarkSharedWeight(filter_tensor_, b_pack_ptr_);
15680+  }
15681+  MS_CHECK_PTR(b_pack_ptr_);
15682+  std::string input_b_str = allocator_->GetRuntimeAddr(filter_tensor_);
15683+  input_b_pack_str_ = allocator_->GetRuntimeAddr(static_cast<float16 *>(b_pack_ptr_));
15684+  init_code.CodeBufferOffsetExpression(b_pack_ptr_, context->weight_name(), context->weight_offset_name(),
15685+                                       context->weight_size_name(), b_pack_ptr_size);
15686+  if (b_batch_ == C1NUM) {
15687+    if (params_->b_transpose_) {
15688+      init_code.CodeFunction("RowMajor2ColNMajorFp16", input_b_str, input_b_pack_str_, params_->col_, params_->deep_,
15689+                             "false");
15690+    } else {
15691+      init_code.CodeFunction("RowMajor2RowNMajorFp16", input_b_str, input_b_pack_str_, params_->deep_, params_->col_,
15692+                             "false");
15693+    }
15694+  } else {
15695+    init_code << "  for (int i = 0; i < " << b_batch_ << "; i++) {\n"
15696+              << "    float16_t *src = " << input_b_str << " + i * " << params_->deep_ * params_->col_ << ";\n"
15697+              << "    float16_t *dst = " << input_b_pack_str_ << " + i * " << params_->deep_ * params_->col_align_
15698+              << ";\n";
15699+    if (params_->b_transpose_) {
15700+      init_code << "    RowMajor2ColNMajorFp16(src, dst, " << params_->col_ << ", " << params_->deep_ << ", false);\n";
15701+    } else {
15702+      init_code << "    RowMajor2RowNMajorFp16(src, dst, " << params_->deep_ << ", " << params_->col_ << ", false);\n";
15703+    }
15704+    init_code << "  }\n";
15705+  }
15706+  context->AppendInitWeightSizeCode(b_pack_ptr_size);
15707+  context->AppendInitCode(init_code.str());
15708+  return RET_OK;
15709+}
15710+
15711+int MatMulDynamicFP16BaseCoder::InitBiasData(CoderContext *const context) {
15712+  NNaclFp32Serializer init_code;
15713+  if (bias_ptr_ != nullptr) {
15714+    return RET_OK;
15715+  }
15716+  auto bias_pack_ptr_size = static_cast<size_t>(params_->col_align_ * DataTypeSize(data_type_));
15717+  if (input_tensors_.size() == C3NUM) {
15718+    bias_ptr_ =
15719+      allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, bias_tensor_->tensor_name() + "_online_pack");
15720+    MS_CHECK_PTR(bias_ptr_);
15721+  } else {
15722+    bias_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, node_->name_ + "_bias_online_pack");
15723+    MS_CHECK_PTR(bias_ptr_);
15724+  }
15725+  init_code.CodeBufferOffsetExpression(bias_ptr_, context->weight_name(), context->weight_offset_name(),
15726+                                       context->weight_size_name(), bias_pack_ptr_size);
15727+  bias_str_ = allocator_->GetRuntimeAddr(bias_ptr_);
15728+  if (input_tensors_.size() == DIMENSION_3D) {
15729+    auto origin_bias_str = allocator_->GetRuntimeAddr(bias_tensor_);
15730+    init_code.CodeFunction("memcpy", bias_str_, origin_bias_str, bias_tensor_->Size());
15731+  } else {
15732+    init_code.CodeFunction("memset", bias_str_, 0, bias_pack_ptr_size);
15733+  }
15734+  context->AppendInitWeightSizeCode(bias_pack_ptr_size);
15735+  context->AppendInitCode(init_code.str());
15736+  return RET_OK;
15737+}
15738+
15739+int MatMulDynamicFP16BaseCoder::ComputeWorkSpace() {
15740+  auto a_shape = shape_info_container_->GetTemplateShape(input_tensor_);
15741+  std::map<std::string, std::vector<int>> real_nums;
15742+  size_t scene_num = 0;
15743+  for (auto &dim_template : a_shape) {
15744+    auto dim_nums = shape_info_container_->GetRealNums(dim_template);
15745+    MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0.");
15746+    real_nums[dim_template] = dim_nums;
15747+    scene_num = std::max(scene_num, dim_nums.size());
15748+  }
15749+  for (size_t i = 0; i < scene_num; ++i) {
15750+    std::vector<int> real_shape(a_shape.size());
15751+    for (size_t j = 0; j < a_shape.size(); ++j) {
15752+      if (IsNumber(a_shape[j])) {
15753+        real_shape[j] = std::stoi(a_shape[j]);
15754+      } else {
15755+        real_shape[j] = real_nums[a_shape[j]][i % real_nums[a_shape[j]].size()];
15756+      }
15757+    }
15758+    int a_batch = 1;
15759+    for (size_t j = 0; j < a_shape.size() - C2NUM; ++j) {
15760+      MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch, real_shape[j], RET_ERROR);
15761+      a_batch *= real_shape[j];
15762+    }
15763+    int row = params_->a_transpose_ ? real_shape.back() : real_shape[real_shape.size() - C2NUM];
15764+    int deep = params_->a_transpose_ ? real_shape[real_shape.size() - C2NUM] : real_shape.back();
15765+    MS_CHECK_TRUE_MSG(deep == params_->deep_, RET_INPUT_TENSOR_ERROR,
15766+                      "Matmul's matrixA doesn't match matrixB, because their deeps are not the same.");
15767+    int workspace = 0;
15768+    if (params_->a_transpose_) {
15769+      workspace = (row == 1 ? (a_batch <= C3NUM ? 0 : UP_ROUND(a_batch, row_tile_)) : UP_ROUND(row, row_tile_)) * deep;
15770+    } else {
15771+      workspace = (a_batch * row <= C3NUM ? 0 : UP_ROUND(a_batch * row, row_tile_)) * deep;
15772+    }
15773+    buffer_start_ = dynamic_mem_manager_->AllocWorkSpace(workspace, i);
15774+    MS_CHECK_TRUE_MSG(!buffer_start_.empty(), RET_ERROR, "Matmul cannot alloc workspace.");
15775+  }
15776+  return RET_OK;
15777+}
15778+
15779+int MatMulDynamicFP16BaseCoder::CollectFilesForTarget(CoderContext *const context) {
15780+  Collect(context,
15781+          {
15782+            "nnacl/fp16/pack_fp16.h",
15783+            "nnacl/fp16/matmul_fp16.h",
15784+          },
15785+          {
15786+            "pack_fp16.c",
15787+            "matmul_fp16.c",
15788+          });
15789+  if (target_ == kARM32) {
15790+    Collect(context, {}, {},
15791+            {
15792+              "Matmul12x8Fp16.S",
15793+              "MatVecMulFp16.S",
15794+            });
15795+  } else if (target_ == kARM64) {
15796+    Collect(context, {}, {}, {"MatmulFp16OptV2.S"});
15797+  }
15798+  return RET_OK;
15799+}
15800+}  // namespace mindspore::lite::micro::nnacl
15801diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h
15802new file mode 100644
15803index 00000000..f73cfff7
15804--- /dev/null
15805+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h
15806@@ -0,0 +1,73 @@
15807+/**
15808+ * Copyright 2023 Huawei Technologies Co., Ltd
15809+ *
15810+ * Licensed under the Apache License, Version 2.0 (the "License");
15811+ * you may not use this file except in compliance with the License.
15812+ * You may obtain a copy of the License at
15813+ *
15814+ * http://www.apache.org/licenses/LICENSE-2.0
15815+ *
15816+ * Unless required by applicable law or agreed to in writing, software
15817+ * distributed under the License is distributed on an "AS IS" BASIS,
15818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15819+ * See the License for the specific language governing permissions and
15820+ * limitations under the License.
15821+ */
15822+
15823+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_BASE_CODER_H_
15824+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_BASE_CODER_H_
15825+
15826+#include <vector>
15827+#include <string>
15828+#include "tools/converter/micro/coder/opcoders/op_coder.h"
15829+#include "tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
15830+#include "nnacl/matmul_parameter.h"
15831+#include "tools/converter/micro/coder/shape_info_container.h"
15832+#include "tools/converter/micro/coder/dynamic_mem_manager.h"
15833+#include "base/float16.h"
15834+#include "coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h"
15835+
15836+namespace mindspore::lite::micro::nnacl {
15837+class MatMulDynamicFP16BaseCoder : public OperatorCoder {
15838+ public:
15839+  MatMulDynamicFP16BaseCoder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
15840+                             const LiteGraph::Node *node, size_t node_index, Target target)
15841+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
15842+
15843+  ~MatMulDynamicFP16BaseCoder() override = default;
15844+
15845+  int Prepare(CoderContext *const context) override;
15846+
15847+  int DoCode(CoderContext *const context) override;
15848+
15849+ private:
15850+  int InitBiasData(CoderContext *const context);
15851+  int InitMatrixB(CoderContext *const context);
15852+  int CollectFilesForTarget(CoderContext *const context);
15853+  int ComputeWorkSpace();
15854+
15855+ protected:
15856+  virtual int InitAShape() = 0;
15857+  virtual int InitBShape() = 0;
15858+
15859+ protected:
15860+  Tensor *filter_tensor_{nullptr};
15861+  Tensor *bias_tensor_{nullptr};
15862+  MatMulParameter *params_{nullptr};
15863+  MatmulDynamicParameter dynamic_params_;
15864+  void *a_pack_ptr_ = nullptr;
15865+  void *b_pack_ptr_ = nullptr;
15866+  void *bias_ptr_{nullptr};
15867+  int col_tile_{0};
15868+  int row_tile_{0};
15869+  size_t a_pack_ptr_size_{0};
15870+  TypeId data_type_{kNumberTypeFloat16};
15871+  int a_batch_;
15872+  int b_batch_;
15873+  std::string buffer_start_;
15874+  std::string bias_str_;
15875+  std::string input_a_pack_str_;
15876+  std::string input_b_pack_str_;
15877+};
15878+}  // namespace mindspore::lite::micro::nnacl
15879+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_BASE_CODER_H_
15880diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc
15881new file mode 100644
15882index 00000000..24cf7120
15883--- /dev/null
15884+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc
15885@@ -0,0 +1,100 @@
15886+/**
15887+ * Copyright 2023 Huawei Technologies Co., Ltd
15888+ *
15889+ * Licensed under the Apache License, Version 2.0 (the "License");
15890+ * you may not use this file except in compliance with the License.
15891+ * You may obtain a copy of the License at
15892+ *
15893+ * http://www.apache.org/licenses/LICENSE-2.0
15894+ *
15895+ * Unless required by applicable law or agreed to in writing, software
15896+ * distributed under the License is distributed on an "AS IS" BASIS,
15897+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15898+ * See the License for the specific language governing permissions and
15899+ * limitations under the License.
15900+ */
15901+
15902+#include "coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h"
15903+#include <vector>
15904+#include "coder/log.h"
15905+#include "coder/opcoders/file_collector.h"
15906+#include "tools/common/string_util.h"
15907+#include "coder/utils/coder_utils.h"
15908+
15909+using mindspore::schema::PrimitiveType_MatMulFusion;
15910+
15911+namespace mindspore::lite::micro::nnacl {
15912+int MatMulDynamicFP16Coder::InitAShape() {
15913+  auto a_shape = shape_info_container_->GetTemplateShape(input_tensor_);
15914+  auto a_shape_size = a_shape.size();
15915+  MS_CHECK_TRUE_MSG(a_shape_size >= DIMENSION_2D, RET_NOT_SUPPORT, "Matmul's a_shape_size must be not less than two.");
15916+  int64_t const_part = 1;
15917+  std::string non_const_part;
15918+  for (size_t i = 0; i < a_shape_size - C2NUM; ++i) {
15919+    if (IsNumber(a_shape[i])) {
15920+      const_part *= std::atoi(a_shape[i].c_str());
15921+    } else {
15922+      if (!non_const_part.empty()) {
15923+        non_const_part += " * ";
15924+      }
15925+      non_const_part += a_shape[i];
15926+    }
15927+  }
15928+  dynamic_params_.batch_ = non_const_part.empty() ? std::to_string(const_part) : non_const_part + " * " + std::to_string(const_part);
15929+  dynamic_params_.row_ = params_->a_transpose_ ? a_shape[a_shape.size() - C1NUM] : a_shape[a_shape.size() - C2NUM];
15930+  return RET_OK;
15931+}
15932+
15933+int MatMulDynamicFP16Coder::InitBShape() {
15934+  std::vector<int> b_shape = filter_tensor_->shape();
15935+  MS_CHECK_TRUE_MSG(b_shape.size() >= DIMENSION_2D, RET_NOT_SUPPORT,
15936+                    "Matmul's b_shape_size must be not less than two.");
15937+  int batch = 1;
15938+  for (size_t i = 0; i < b_shape.size() - DIMENSION_2D; ++i) {
15939+    batch *= b_shape[i];
15940+  }
15941+  if (batch != 1) {
15942+    MS_LOG(ERROR) << "Currently, Matmul only support matrixB's batch is 1.";  // NOTE(review): logs but does not return an error; confirm multi-batch B is really handled (InitMatrixB loops over b_batch_, DoCode does not) or return RET_NOT_SUPPORT here
15943+  }
15944+  b_batch_ = batch;
15945+  params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - C2NUM] : b_shape[b_shape.size() - C1NUM];
15946+  params_->col_8_ = UP_ROUND(params_->col_, C8NUM);
15947+  params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - C1NUM] : b_shape[b_shape.size() - C2NUM];
15948+  return RET_OK;
15949+}
15950+
15951+int MatMulDynamicFP16Coder::Prepare(CoderContext *const context) {
15952+  for (size_t i = 0; i < input_tensors_.size(); ++i) {
15953+    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
15954+                      "Input tensor data type is invalid.");
15955+  }
15956+  MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
15957+                    "Output tensor data type is invalid.");
15958+  MS_CHECK_TRUE_MSG(input_tensors_.size() == C2NUM || input_tensors_.size() == C3NUM, RET_INPUT_PARAM_INVALID,
15959+                    "MatMul's input-num must be 2 or 3.");
15960+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT,
15961+                    "Currently, only support the first input of matmul is non-const when shape is dynamical.");
15962+  if (input_tensors_.size() == C3NUM) {
15963+    MS_CHECK_TRUE_MSG(input_tensors_[THIRD_INPUT]->IsConst(), RET_NOT_SUPPORT,
15964+                      "Currently, only support the first input of matmul is non-const when shape is dynamical.");
15965+  }
15966+  params_ = reinterpret_cast<MatMulParameter *>(parameter_);
15967+  filter_tensor_ = input_tensors_.at(kWeightIndex);
15968+  MS_CHECK_PTR(filter_tensor_);
15969+  if (input_tensors_.size() == kInputSize2) {
15970+    bias_tensor_ = input_tensors_.at(kBiasIndex);
15971+    MS_CHECK_PTR(bias_tensor_);
15972+    MS_CHECK_PTR(bias_tensor_->data());
15973+  }
15974+  params_->a_const_ = (input_tensor_->data() != nullptr);
15975+  params_->b_const_ = (filter_tensor_->data() != nullptr);
15976+  MS_CHECK_RET_CODE(MatMulDynamicFP16BaseCoder::Prepare(context), "MatMulDynamicFP16Coder prepare failed");
15977+  return RET_OK;
15978+}
15979+
15980+int MatMulDynamicFP16Coder::DoCode(CoderContext *const context) { return MatMulDynamicFP16BaseCoder::DoCode(context); }
15981+
15982+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_MatMulFusion,
15983+                           CPUOpCoderCreator<MatMulDynamicFP16Coder>)
15984+// REG_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_MatMulFusion, CPUOpCoderCreator<MatMulDynamicFP16Coder>)
15985+}  // namespace mindspore::lite::micro::nnacl
15986diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h
15987new file mode 100644
15988index 00000000..1a16798c
15989--- /dev/null
15990+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h
15991@@ -0,0 +1,44 @@
15992+/**
15993+ * Copyright 2023 Huawei Technologies Co., Ltd
15994+ *
15995+ * Licensed under the Apache License, Version 2.0 (the "License");
15996+ * you may not use this file except in compliance with the License.
15997+ * You may obtain a copy of the License at
15998+ *
15999+ * http://www.apache.org/licenses/LICENSE-2.0
16000+ *
16001+ * Unless required by applicable law or agreed to in writing, software
16002+ * distributed under the License is distributed on an "AS IS" BASIS,
16003+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16004+ * See the License for the specific language governing permissions and
16005+ * limitations under the License.
16006+ */
16007+
16008+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_CODER_H_
16009+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_CODER_H_
16010+
16011+#include <vector>
16012+#include "tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h"
16013+#include "nnacl/matmul_parameter.h"
16014+#include "tools/converter/micro/coder/shape_info_container.h"
16015+#include "tools/converter/micro/coder/dynamic_mem_manager.h"
16016+
16017+namespace mindspore::lite::micro::nnacl {
16018+class MatMulDynamicFP16Coder final : public MatMulDynamicFP16BaseCoder {
16019+ public:
16020+  MatMulDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
16021+                         const LiteGraph::Node *node, size_t node_index, Target target)
16022+      : MatMulDynamicFP16BaseCoder(in_tensors, out_tensors, node, node_index, target) {}
16023+
16024+  ~MatMulDynamicFP16Coder() override = default;
16025+
16026+  int Prepare(CoderContext *const context) override;
16027+
16028+  int DoCode(CoderContext *const context) override;
16029+
16030+ private:
16031+  int InitAShape() override;
16032+  int InitBShape() override;
16033+};
16034+}  // namespace mindspore::lite::micro::nnacl
16035+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_CODER_H_
16036diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc
16037index 67f633fe..415e912d 100644
16038--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc
16039+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc
16040@@ -102,14 +102,15 @@ std::string MatMulFP16BaseCoder::InitMatrixA(NNaclFp32Serializer *const code, NN
16041   if (a_batch_ == 1) {
16042     if (params_.a_transpose_) {
16043       if (target_ == kARM64) {
16044-        pack_code.CodeFunction("RowMajor2RowNMajorFp16", input_a_str, input_a_pack_str, params_.deep_, params_.row_);
16045+        pack_code.CodeFunction("RowMajor2RowNMajorFp16", input_a_str, input_a_pack_str, params_.deep_, params_.row_,
16046+                               "false");
16047       } else {
16048         pack_code.CodeFunction("RowMajor2Row12MajorFp16", input_a_str, input_a_pack_str, params_.deep_, params_.row_,
16049                                false);
16050       }
16051     } else {
16052       if (target_ == kARM64) {
16053-        pack_code.CodeFunction("RowMajor2ColNMajorFp16", input_a_str, input_a_pack_str, params_.row_, params_.deep_);
16054+        pack_code.CodeFunction("RowMajor2ColNMajorFp16", input_a_str, input_a_pack_str, params_.row_, params_.deep_, false);
16055       } else {
16056         pack_code.CodeFunction("RowMajor2Col12MajorFp16", input_a_str, input_a_pack_str, params_.row_, params_.deep_,
16057                                false);
16058@@ -122,13 +123,13 @@ std::string MatMulFP16BaseCoder::InitMatrixA(NNaclFp32Serializer *const code, NN
16059               << ";\n";
16060     if (params_.a_transpose_) {
16061       if (target_ == kARM64) {
16062-        pack_code << "    RowMajor2RowNMajorFp16(src, dst, " << params_.deep_ << ", " << params_.row_ << ");\n";
16063+        pack_code << "    RowMajor2RowNMajorFp16(src, dst, " << params_.deep_ << ", " << params_.row_ << ", false);\n";
16064       } else {
16065         pack_code << "    RowMajor2Row12MajorFp16(src, dst, " << params_.deep_ << ", " << params_.row_ << ", false);\n";
16066       }
16067     } else {
16068       if (target_ == kARM64) {
16069-        pack_code << "    RowMajor2ColNMajorFp16(src, dst, " << params_.row_ << ", " << params_.deep_ << ");\n";
16070+        pack_code << "    RowMajor2ColNMajorFp16(src, dst, " << params_.row_ << ", " << params_.deep_ << ", false);\n";
16071       } else {
16072         pack_code << "    RowMajor2Col12MajorFp16(src, dst, " << params_.row_ << ", " << params_.deep_ << ", false);\n";
16073       }
16074diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc
16075new file mode 100644
16076index 00000000..c565f5b2
16077--- /dev/null
16078+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc
16079@@ -0,0 +1,89 @@
16080+/**
16081+ * Copyright 2023 Huawei Technologies Co., Ltd
16082+ *
16083+ * Licensed under the Apache License, Version 2.0 (the "License");
16084+ * you may not use this file except in compliance with the License.
16085+ * You may obtain a copy of the License at
16086+ *
16087+ * http://www.apache.org/licenses/LICENSE-2.0
16088+ *
16089+ * Unless required by applicable law or agreed to in writing, software
16090+ * distributed under the License is distributed on an "AS IS" BASIS,
16091+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16092+ * See the License for the specific language governing permissions and
16093+ * limitations under the License.
16094+ */
16095+#include "coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h"
16096+#include <cfloat>
16097+#include <string>
16098+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
16099+#include "coder/log.h"
16100+#include "coder/opcoders/parallel.h"
16101+#include "coder/opcoders/file_collector.h"
16102+#include "coder/utils/coder_utils.h"
16103+
16104+using mindspore::schema::PrimitiveType_AvgPoolFusion;
16105+using mindspore::schema::PrimitiveType_MaxPoolFusion;
16106+
16107+namespace mindspore::lite::micro::nnacl {
16108+int PoolingDynamicFP16Coder::Prepare(CoderContext *const context) {
16109+  if (input_tensor_->data_type() != kNumberTypeFloat16 || output_tensor_->data_type() != kNumberTypeFloat16) {
16110+    MS_LOG(ERROR) << "Tensor data type is invalid";
16111+    return lite::RET_INPUT_PARAM_INVALID;
16112+  }
16113+  param_ = reinterpret_cast<PoolingParameter *>(parameter_);
16114+  MS_CHECK_PTR(param_);
16115+  dynamic_param_.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_)[0];
16116+  compute_.input_channel_ = input_tensor_->Channel();
16117+  compute_.input_h_ = input_tensor_->Height();
16118+  compute_.input_w_ = input_tensor_->Width();
16119+  dynamic_param_.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_)[0];
16120+  compute_.output_channel_ = output_tensor_->Channel();
16121+  compute_.output_h_ = output_tensor_->Height();
16122+  compute_.output_w_ = output_tensor_->Width();
16123+  if (param_->global_) {
16124+    param_->window_h_ = compute_.input_h_;
16125+    param_->window_w_ = compute_.input_w_;
16126+  }
16127+  return RET_OK;
16128+}
16129+
16130+int PoolingDynamicFP16Coder::DoCode(CoderContext *const context) {
16131+  Collect(context,
16132+          {
16133+            "nnacl/fp16/pooling_fp16.h",
16134+          },
16135+          {
16136+            "pooling_fp16.c",
16137+          });
16138+  NNaclFp32Serializer code;
16139+  code.CodeStruct("pooling_parameter", *param_);
16140+  code.CodeStruct("pooling_compute", compute_, dynamic_param_);
16141+
16142+  auto input_data =
16143+    "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
16144+  auto output_data =
16145+    "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
16146+  if (param_->pool_mode_ == PoolMode_MaxPool) {
16147+    code.CodeFunction("MaxPoolingFp16", input_data, output_data, "&pooling_parameter", "&pooling_compute",
16148+                      kDefaultTaskId, param_->op_parameter_.thread_num_);
16149+  } else if (param_->pool_mode_ == PoolMode_AvgPool) {
16150+    code.CodeFunction("AvgPoolingFp16", input_data, output_data, "&pooling_parameter", "&pooling_compute",
16151+                      kDefaultTaskId, param_->op_parameter_.thread_num_);
16152+  } else {
16153+    MS_LOG(ERROR) << "Unsupported pooling mode.";
16154+    return lite::RET_ERROR;
16155+  }
16156+  context->AppendCode(code.str());
16157+  return lite::RET_OK;
16158+}
16159+
16160+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_AvgPoolFusion,
16161+                           CPUOpCoderCreator<PoolingDynamicFP16Coder>)
16162+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_AvgPoolFusion,
16163+                           CPUOpCoderCreator<PoolingDynamicFP16Coder>)
16164+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_MaxPoolFusion,
16165+                           CPUOpCoderCreator<PoolingDynamicFP16Coder>)
16166+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_MaxPoolFusion,
16167+                           CPUOpCoderCreator<PoolingDynamicFP16Coder>)
16168+}  // namespace mindspore::lite::micro::nnacl
16169diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h
16170new file mode 100644
16171index 00000000..7b138b61
16172--- /dev/null
16173+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h
16174@@ -0,0 +1,44 @@
16175+/**
16176+ * Copyright 2023 Huawei Technologies Co., Ltd
16177+ *
16178+ * Licensed under the Apache License, Version 2.0 (the "License");
16179+ * you may not use this file except in compliance with the License.
16180+ * You may obtain a copy of the License at
16181+ *
16182+ * http://www.apache.org/licenses/LICENSE-2.0
16183+ *
16184+ * Unless required by applicable law or agreed to in writing, software
16185+ * distributed under the License is distributed on an "AS IS" BASIS,
16186+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16187+ * See the License for the specific language governing permissions and
16188+ * limitations under the License.
16189+ */
16190+
16191+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_POOLING_DYNAMIC_FP16_CODER_H_
16192+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_POOLING_DYNAMIC_FP16_CODER_H_
16193+
16194+#include <vector>
16195+#include "coder/opcoders/op_coder.h"
16196+#include "coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h"
16197+#include "nnacl/pooling_parameter.h"
16198+#include "nnacl/kernel/pooling.h"
16199+
16200+namespace mindspore::lite::micro::nnacl {
16201+class PoolingDynamicFP16Coder final : public OperatorCoder {
16202+ public:
16203+  PoolingDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
16204+                          const LiteGraph::Node *node, size_t node_index, Target target)
16205+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
16206+  ~PoolingDynamicFP16Coder() override = default;
16207+
16208+  int Prepare(CoderContext *const context) override;
16209+
16210+  int DoCode(CoderContext *const context) override;
16211+
16212+ private:
16213+  PoolingParameter *param_{nullptr};
16214+  PoolingComputeParam compute_;
16215+  PoolingDynamicParameter dynamic_param_;
16216+};
16217+}  // namespace mindspore::lite::micro::nnacl
16218+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_POOLING_DYNAMIC_FP16_CODER_H_
16219diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc
16220new file mode 100644
16221index 00000000..733cf49d
16222--- /dev/null
16223+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc
16224@@ -0,0 +1,128 @@
16225+/**
16226+ * Copyright 2023 Huawei Technologies Co., Ltd
16227+ *
16228+ * Licensed under the Apache License, Version 2.0 (the "License");
16229+ * you may not use this file except in compliance with the License.
16230+ * You may obtain a copy of the License at
16231+ *
16232+ * http://www.apache.org/licenses/LICENSE-2.0
16233+ *
16234+ * Unless required by applicable law or agreed to in writing, software
16235+ * distributed under the License is distributed on an "AS IS" BASIS,
16236+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16237+ * See the License for the specific language governing permissions and
16238+ * limitations under the License.
16239+ */
16240+#include "coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h"
16241+#include <string>
16242+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
16243+#include "coder/opcoders/file_collector.h"
16244+#include "coder/opcoders/parallel.h"
16245+#include "coder/utils/coder_utils.h"
16246+
16247+using mindspore::schema::PrimitiveType_ScaleFusion;
16248+
16249+namespace mindspore::lite::micro::nnacl {
16250+int ScaleDynamicFP16Coder::Prepare(CoderContext *const context) {
16251+  for (size_t i = 0; i < input_tensors_.size(); ++i) {
16252+    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
16253+                      "Input tensor data type should be fp16, now is " << input_tensors_[i]->data_type());
16254+  }
16255+  MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
16256+                    "Output tensor data type should be fp16, now is " << output_tensor_->data_type());
16257+
16258+  scale_param_ = reinterpret_cast<ScaleParameter *>(parameter_);
16259+  MS_CHECK_PTR(scale_param_);
16260+  scale_struct_.base_.param_ = parameter_;
16261+  if (input_tensors_.size() < DIMENSION_2D || input_tensors_.size() > DIMENSION_3D) {
16262+    MS_LOG(ERROR) << "inputs to Scale operator should be 2 or 3, but " << input_tensors_.size() << " is given.";
16263+    return RET_ERROR;
16264+  }
16265+  scale_tensor_ = input_tensors_.at(kWeightIndex);
16266+  MS_CHECK_PTR(scale_tensor_);
16267+  MS_CHECK_RET_CODE(CalculateParameter(), "Scale fp16 CalculateParameter failed.");
16268+  return RET_OK;
16269+}
16270+
16271+int ScaleDynamicFP16Coder::DoCode(CoderContext *const context) {
16272+  // init struct ScaleParameters
16273+  Collect(context,
16274+          {
16275+            "nnacl/kernel/scale.h",
16276+            "nnacl/fp16/scale_fp16.h",
16277+          },
16278+          {
16279+            "scale_fp16.c",
16280+          });
16281+
16282+  NNaclFp32Serializer code;
16283+  code.CodeStruct("scale_struct", scale_struct_, dynamic_param_);
16284+
16285+  auto scale = GetTensorAddr(scale_tensor_, scale_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
16286+  std::string offset{"NULL"};
16287+  if (input_tensors_.size() == DIMENSION_3D) {
16288+    auto offset_tensor = input_tensors_.at(kBiasIndex);
16289+    offset = GetTensorAddr(offset_tensor, offset_tensor->IsConst(), dynamic_mem_manager_, allocator_);
16290+  }
16291+  std::string input_str =
16292+    "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
16293+  std::string output_str =
16294+    "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
16295+  switch (scale_param_->activation_type_) {
16296+    case schema::ActivationType_RELU6:
16297+      code.CodeFunction("DoScaleRelu6Fp16", input_str, output_str, scale, offset, kDefaultTaskId, "&scale_struct");
16298+      break;
16299+    case schema::ActivationType_RELU:
16300+      code.CodeFunction("Fp16DoScaleRelu", input_str, output_str, scale, offset, kDefaultTaskId, "&scale_struct");
16301+      break;
16302+    case schema::ActivationType_NO_ACTIVATION:
16303+      code.CodeFunction("DoScaleFp16", input_str, output_str, scale, offset, kDefaultTaskId, "&scale_struct");
16304+      break;
16305+    default:
16306+      MS_LOG(ERROR) << "Scale does not support activation type " << scale_param_->activation_type_;
16307+      return RET_ERROR;
16308+  }
16309+  context->AppendCode(code.str());
16310+  return RET_OK;
16311+}
16312+
16313+int ScaleDynamicFP16Coder::CalculateParameter() {
16314+  auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_);
16315+  std::vector<std::string> scale_shape;
16316+  if (scale_tensor_->IsConst()) {
16317+    for (auto dim : scale_tensor_->shape()) {
16318+      scale_shape.emplace_back(std::to_string(dim));
16319+    }
16320+  } else {
16321+    scale_shape = shape_info_container_->GetTemplateShape(scale_tensor_);
16322+  }
16323+  scale_struct_.axis_ =
16324+    scale_param_->axis_ < 0 ? scale_param_->axis_ + static_cast<int>(in_shape.size()) : scale_param_->axis_;
16325+  // normalize unconditionally: axis_ was left uninitialized for non-negative axes before indexing below
16326+  if (scale_shape.size() + scale_struct_.axis_ > in_shape.size()) {
16327+    MS_LOG(ERROR) << "Scale tensor shape is incorrect.";
16328+    return RET_ERROR;
16329+  }
16330+  dynamic_param_.outer_size_ = AccumulateShape(in_shape, 0, scale_struct_.axis_);
16331+  if (scale_tensor_->IsConst() && scale_tensor_->shape().size() == 1) {
16332+    dynamic_param_.axis_size_ = in_shape.at(scale_struct_.axis_);
16333+  } else {
16334+    dynamic_param_.axis_size_ = "{";
16335+    for (size_t i = 0; i < scale_shape.size(); i++) {
16336+      if (in_shape.at(i + scale_struct_.axis_) != scale_shape.at(i)) {
16337+        MS_LOG(ERROR) << "Scale tensor shape is incorrect.";
16338+        return RET_ERROR;
16339+      }
16340+      dynamic_param_.axis_size_ += in_shape.at(i + scale_struct_.axis_) + ", ";
16341+    }
16342+    dynamic_param_.axis_size_ += "}";
16343+  }
16344+  dynamic_param_.inner_size_ = AccumulateShape(in_shape, scale_struct_.axis_ + scale_shape.size(), in_shape.size());
16345+  return RET_OK;
16346+}
16347+
16348+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_ScaleFusion,
16349+                           CPUOpCoderCreator<ScaleDynamicFP16Coder>)
16350+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_ScaleFusion,
16351+                           CPUOpCoderCreator<ScaleDynamicFP16Coder>)
16352+}  // namespace mindspore::lite::micro::nnacl
16353diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h
16354new file mode 100644
16355index 00000000..02ec35ba
16356--- /dev/null
16357+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h
16358@@ -0,0 +1,46 @@
16359+/**
16360+ * Copyright 2023 Huawei Technologies Co., Ltd
16361+ *
16362+ * Licensed under the Apache License, Version 2.0 (the "License");
16363+ * you may not use this file except in compliance with the License.
16364+ * You may obtain a copy of the License at
16365+ *
16366+ * http://www.apache.org/licenses/LICENSE-2.0
16367+ *
16368+ * Unless required by applicable law or agreed to in writing, software
16369+ * distributed under the License is distributed on an "AS IS" BASIS,
16370+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16371+ * See the License for the specific language governing permissions and
16372+ * limitations under the License.
16373+ */
16374+
16375+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SCALE_DYNAMIC_FP16_CODER_H_
16376+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SCALE_DYNAMIC_FP16_CODER_H_
16377+
16378+#include <vector>
16379+#include "coder/opcoders/op_coder.h"
16380+#include "coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h"
16381+#include "nnacl/kernel/scale.h"
16382+#include "nnacl/scale_parameter.h"
16383+
16384+namespace mindspore::lite::micro::nnacl {
16385+class ScaleDynamicFP16Coder final : public OperatorCoder {
16386+ public:
16387+  ScaleDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
16388+                        const LiteGraph::Node *node, size_t node_index, Target target)
16389+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
16390+  ~ScaleDynamicFP16Coder() override = default;
16391+
16392+  int Prepare(CoderContext *const context) override;
16393+
16394+  int DoCode(CoderContext *const context) override;
16395+
16396+ private:
16397+  int CalculateParameter();
16398+  ScaleParameter *scale_param_{nullptr};
16399+  ScaleStruct scale_struct_;
16400+  ScaleDynamicParameter dynamic_param_;
16401+  Tensor *scale_tensor_{nullptr};
16402+};
16403+}  // namespace mindspore::lite::micro::nnacl
16404+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SCALE_DYNAMIC_FP16_CODER_H_
16405diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc
16406new file mode 100644
16407index 00000000..1c6969b2
16408--- /dev/null
16409+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc
16410@@ -0,0 +1,160 @@
16411+/**
16412+ * Copyright 2023 Huawei Technologies Co., Ltd
16413+ *
16414+ * Licensed under the Apache License, Version 2.0 (the "License");
16415+ * you may not use this file except in compliance with the License.
16416+ * You may obtain a copy of the License at
16417+ *
16418+ * http://www.apache.org/licenses/LICENSE-2.0
16419+ *
16420+ * Unless required by applicable law or agreed to in writing, software
16421+ * distributed under the License is distributed on an "AS IS" BASIS,
16422+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16423+ * See the License for the specific language governing permissions and
16424+ * limitations under the License.
16425+ */
16426+
16427+#include "coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h"
16428+#include "coder/opcoders/file_collector.h"
16429+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
16430+#include "coder/utils/coder_utils.h"
16431+
16432+using mindspore::schema::PrimitiveType_SliceFusion;
16433+
16434+namespace mindspore::lite::micro::nnacl {
16435+int SliceDynamicFP16Coder::Prepare(CoderContext *const context) {
16436+  CHECK_LESS_RETURN(input_tensors_.size(), C3NUM);
16437+  CHECK_LESS_RETURN(output_tensors_.size(), 1);
16438+  CHECK_NULL_RETURN(input_tensors_[FIRST_INPUT]);
16439+  CHECK_NULL_RETURN(input_tensors_[SECOND_INPUT]);
16440+  CHECK_NULL_RETURN(input_tensors_[THIRD_INPUT]);
16441+  CHECK_NULL_RETURN(output_tensor_);
16442+  param_ = reinterpret_cast<SliceParameter *>(parameter_);
16443+  CHECK_NULL_RETURN(param_);
16444+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst() && input_tensors_[THIRD_INPUT]->IsConst(), RET_NOT_SUPPORT,
16445+                    "The second and third input of slice is non-const.");
16446+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32 &&
16447+                      input_tensors_[THIRD_INPUT]->data_type() == kNumberTypeInt32,
16448+                    RET_INPUT_PARAM_INVALID, "second or third input tensor data type need to be int32.");
16449+  if (input_tensor_->data_type() != kNumberTypeFloat16 || output_tensor_->data_type() != kNumberTypeFloat16) {
16450+    MS_LOG(ERROR) << "Tensor data type is invalid";
16451+    return lite::RET_INPUT_PARAM_INVALID;
16452+  }
16453+  return Init();
16454+}
16455+
16456+int SliceDynamicFP16Coder::DoCode(CoderContext *const context) {
16457+  Collect(context,
16458+          {
16459+            "nnacl/base/slice_base.h",
16460+          },
16461+          {
16462+            "slice_base.c",
16463+          });
16464+  NNaclFp32Serializer code;
16465+  code.CodeStruct("slice_param", *param_, dynamic_param_);
16466+  std::string input_data = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
16467+  std::string output_data = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
16468+  if (!support_parallel_) {
16469+    code.CodeFunction("DoSliceNoParallel", input_data, output_data, "&slice_param",
16470+                      DataTypeSize(input_tensor_->data_type()));
16471+  }
16472+  context->AppendCode(code.str());
16473+  return RET_OK;
16474+}
16475+
16476+int SliceDynamicFP16Coder::Init() {
16477+  auto begin_tensor = input_tensors_[SECOND_INPUT];
16478+  auto size_tensor = input_tensors_[THIRD_INPUT];
16479+  data_shape_ = shape_info_container_->GetTemplateShape(input_tensor_);
16480+  MS_CHECK_TRUE_MSG(data_shape_.size() == static_cast<size_t>(begin_tensor->ElementsNum()), RET_ERROR,
16481+                    "The begin tensor is invalid.");
16482+  MS_CHECK_TRUE_MSG(data_shape_.size() == static_cast<size_t>(size_tensor->ElementsNum()), RET_ERROR,
16483+                    "The size tensor is invalid.");
16484+  auto begin = reinterpret_cast<int32_t *>(begin_tensor->data());
16485+  CHECK_NULL_RETURN(begin);
16486+  auto size = reinterpret_cast<int32_t *>(size_tensor->data());
16487+  CHECK_NULL_RETURN(size);
16488+  param_->param_length_ = static_cast<int>(data_shape_.size());
16489+  if (param_->param_length_ > DIMENSION_8D) {
16490+    MS_LOG(ERROR) << "input dimension num should <= " << DIMENSION_8D;
16491+    return RET_ERROR;
16492+  }
16493+  dynamic_param_.shape_ = "{";
16494+  dynamic_param_.size_ = "{";
16495+  dynamic_param_.end_ = "{";
16496+  for (int i = 0; i < param_->param_length_; ++i) {
16497+    dynamic_param_.shape_ += data_shape_[i] + ", ";
16498+    param_->begin_[i] = begin[i];
16499+    if (size[i] < 0) {
16500+      std::string cur_size = data_shape_[i] + " - " + std::to_string(begin[i]);
16501+      slice_size_.emplace_back(cur_size);
16502+      dynamic_param_.size_ += cur_size + ", ";
16503+    } else {
16504+      slice_size_.emplace_back(std::to_string(size[i]));
16505+      dynamic_param_.size_ += std::to_string(size[i]) + ", ";
16506+    }
16507+    std::string cur_end = std::to_string(param_->begin_[i]) + " + " + slice_size_[i];
16508+    end_.emplace_back(cur_end);
16509+    dynamic_param_.end_ += cur_end + ", ";
16510+  }
16511+  dynamic_param_.shape_ += "}";
16512+  dynamic_param_.size_ += "}";
16513+  dynamic_param_.end_ += "}";
16514+  if (param_->param_length_ < DIMENSION_8D) {
16515+    PadSliceParameterTo8D();
16516+  }
16517+  return RET_OK;
16518+}
16519+
16520+void SliceDynamicFP16Coder::PadSliceParameterTo8D() {
16521+  std::vector<int32_t> begin(DIMENSION_8D, 0);
16522+  std::vector<std::string> end(DIMENSION_8D, "");
16523+  std::vector<std::string> slice_size(DIMENSION_8D, "");
16524+  std::vector<std::string> data_shape(DIMENSION_8D, "");
16525+  for (int32_t i = 0; i < param_->param_length_; ++i) {
16526+    begin[i] = param_->begin_[i];
16527+    end[i] = end_[i];
16528+    slice_size[i] =
16529+      slice_size_[i] + " < 0 ? " + data_shape_[i] + " - " + std::to_string(begin[i]) + " : " + slice_size_[i];
16530+    data_shape[i] = data_shape_[i];
16531+  }
16532+  data_shape_.resize(DIMENSION_8D);
16533+  slice_size_.resize(DIMENSION_8D);
16534+  end_.resize(DIMENSION_8D);
16535+  int32_t real_index = param_->param_length_ - 1;
16536+  for (int32_t i = DIMENSION_8D - 1; i >= 0; --i) {
16537+    if (real_index >= 0) {
16538+      param_->begin_[i] = begin[real_index];
16539+      end_[i] = end[real_index];
16540+      slice_size_[i] = slice_size[real_index];
16541+      data_shape_[i] = data_shape[real_index--];
16542+    } else {
16543+      param_->begin_[i] = 0;
16544+      end_[i] = "1";
16545+      slice_size_[i] = "1";
16546+      data_shape_[i] = "1";
16547+    }
16548+  }
16549+  param_->param_length_ = DIMENSION_8D;
16550+  dynamic_param_.shape_.clear();
16551+  dynamic_param_.size_.clear();
16552+  dynamic_param_.end_.clear();
16553+  dynamic_param_.shape_ = "{";
16554+  dynamic_param_.size_ = "{";
16555+  dynamic_param_.end_ = "{";
16556+  for (int i = 0; i < DIMENSION_8D; ++i) {
16557+    dynamic_param_.end_ += end_[i] + ", ";
16558+    dynamic_param_.size_ += slice_size_[i] + ", ";
16559+    dynamic_param_.shape_ += data_shape_[i] + ", ";
16560+  }
16561+  dynamic_param_.shape_ += "}";
16562+  dynamic_param_.size_ += "}";
16563+  dynamic_param_.end_ += "}";
16564+}
16565+
16566+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_SliceFusion,
16567+                           CPUOpCoderCreator<SliceDynamicFP16Coder>)
16568+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_SliceFusion,
16569+                           CPUOpCoderCreator<SliceDynamicFP16Coder>)
16570+};  // namespace mindspore::lite::micro::nnacl
16571diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h
16572new file mode 100644
16573index 00000000..21b1b27b
16574--- /dev/null
16575+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h
16576@@ -0,0 +1,51 @@
16577+/**
16578+ * Copyright 2023 Huawei Technologies Co., Ltd
16579+ *
16580+ * Licensed under the Apache License, Version 2.0 (the "License");
16581+ * you may not use this file except in compliance with the License.
16582+ * You may obtain a copy of the License at
16583+ *
16584+ * http://www.apache.org/licenses/LICENSE-2.0
16585+ *
16586+ * Unless required by applicable law or agreed to in writing, software
16587+ * distributed under the License is distributed on an "AS IS" BASIS,
16588+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16589+ * See the License for the specific language governing permissions and
16590+ * limitations under the License.
16591+ */
16592+
16593+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SLICE_DYNAMIC_FP16_CODER_H_
16594+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SLICE_DYNAMIC_FP16_CODER_H_
16595+
16596+#include <vector>
16597+#include "mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h"
16598+#include "coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h"
16599+#include "nnacl/slice_parameter.h"
16600+#include "nnacl/op_base.h"
16601+
16602+namespace mindspore::lite::micro::nnacl {
16603+class SliceDynamicFP16Coder final : public OperatorCoder {
16604+ public:
16605+  SliceDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
16606+                        const LiteGraph::Node *node, size_t node_index, Target target)
16607+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
16608+
16609+  ~SliceDynamicFP16Coder() override = default;
16610+
16611+  int Prepare(CoderContext *const context) override;
16612+
16613+  int DoCode(CoderContext *const context) override;
16614+
16615+ protected:
16616+  int Init();
16617+  void PadSliceParameterTo8D();
16618+  SliceParameter *param_{nullptr};
16619+  SliceDynamicParameter dynamic_param_;
16620+  std::vector<std::string> in_shapes_;
16621+  std::vector<std::string> out_shapes_;
16622+  std::vector<std::string> data_shape_;
16623+  std::vector<std::string> slice_size_;
16624+  std::vector<std::string> end_;
16625+};
16626+};      // namespace mindspore::lite::micro::nnacl
16627+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SLICE_DYNAMIC_FP16_CODER_H_
16628diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc
16629new file mode 100644
16630index 00000000..1bd09fb5
16631--- /dev/null
16632+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc
16633@@ -0,0 +1,137 @@
16634+/**
16635+ * Copyright 2023 Huawei Technologies Co., Ltd
16636+ *
16637+ * Licensed under the Apache License, Version 2.0 (the "License");
16638+ * you may not use this file except in compliance with the License.
16639+ * You may obtain a copy of the License at
16640+ *
16641+ * http://www.apache.org/licenses/LICENSE-2.0
16642+ *
16643+ * Unless required by applicable law or agreed to in writing, software
16644+ * distributed under the License is distributed on an "AS IS" BASIS,
16645+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16646+ * See the License for the specific language governing permissions and
16647+ * limitations under the License.
16648+ */
16649+
16650+#include "coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h"
16651+#include <string>
16652+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
16653+#include "schema/inner/ops_generated.h"
16654+#include "coder/opcoders/file_collector.h"
16655+#include "coder/utils/coder_utils.h"
16656+#include "tools/common/string_util.h"
16657+#include "base/float16.h"
16658+
16659+using mindspore::schema::PrimitiveType_LogSoftmax;
16660+using mindspore::schema::PrimitiveType_Softmax;
16661+
16662+namespace mindspore::lite::micro::nnacl {
16663+int SoftmaxDynamicFP16Coder::Prepare(CoderContext *const context) {
16664+  for (size_t i = 0; i < input_tensors_.size(); ++i) {
16665+    MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
16666+                      "Input tensor data type is invalid");
16667+  }
16668+  for (size_t i = 0; i < output_tensors_.size(); ++i) {
16669+    MS_CHECK_TRUE_MSG(output_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
16670+                      "Output tensor data type is invalid");
16671+  }
16672+  auto ret = Init();
16673+  MS_CHECK_RET_CODE(ret, "Init failed!");
16674+  return RET_OK;
16675+}
16676+
16677+int SoftmaxDynamicFP16Coder::DoCode(CoderContext *const context) {
16678+  Collect(context,
16679+          {
16680+            "nnacl/fp16/softmax_fp16.h",
16681+            "nnacl/fp16/log_softmax_fp16.h",
16682+          },
16683+          {
16684+            "softmax_fp16.c",
16685+            "log_softmax_fp16.c",
16686+            "exp_fp16.c",
16687+          });
16688+
16689+  auto ret = ComputeWorkSpace();
16690+  MS_CHECK_RET_CODE(ret, "ComputeWorkSpace failed!");
16691+  NNaclFp32Serializer code;
16692+  sum_data_str_ = "(float16_t *)(" + buffer_start_ + ")";
16693+  auto primitive_type = param_->op_parameter_.type_;
16694+  std::string input_data =
16695+    "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
16696+  std::string output_data =
16697+    "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")";
16698+  code << "    int input_shape[" << input_shape_.size() << "] = " << dynamic_param_.input_shape_ << ";\n";
16699+  if (primitive_type == schema::PrimitiveType_Softmax) {
16700+    code.CodeFunction("SoftmaxFp16", input_data, output_data, sum_data_str_, softmax_struct_.axis_,
16701+                      softmax_struct_.n_dim_, "&input_shape");
16702+  } else {
16703+    code.CodeFunction("LogSoftmaxFp16", input_data, output_data, sum_data_str_, "&input_shape", softmax_struct_.n_dim_,
16704+                      softmax_struct_.axis_);
16705+  }
16706+  context->AppendCode(code.str());
16707+  return RET_OK;
16708+}
16709+
16710+int SoftmaxDynamicFP16Coder::Init() {
16711+  param_ = reinterpret_cast<SoftmaxParameter *>(parameter_);
16712+  MS_CHECK_PTR(param_);
16713+  softmax_struct_.base_.param_ = parameter_;
16714+  input_shape_ = shape_info_container_->GetTemplateShape(input_tensor_);
16715+  size_t in_dims = input_shape_.size();
16716+  softmax_struct_.n_dim_ = in_dims;
16717+  softmax_struct_.axis_ = param_->axis_ < 0 ? param_->axis_ + softmax_struct_.n_dim_ : param_->axis_;
16718+  dynamic_param_.element_size_ = AccumulateShape(input_shape_, 0, input_shape_.size());
16719+  dynamic_param_.input_shape_ = "{";
16720+  for (size_t i = 0; i < input_shape_.size(); ++i) {
16721+    dynamic_param_.input_shape_ += input_shape_[i] + ", ";
16722+  }
16723+  dynamic_param_.input_shape_ += "}";
16724+  return RET_OK;
16725+}
16726+
16727+int SoftmaxDynamicFP16Coder::ComputeWorkSpace() {
16728+  std::map<std::string, std::vector<int>> real_nums;
16729+  size_t scene_num = 0;
16730+  for (auto &dim_template : input_shape_) {
16731+    auto dim_nums = shape_info_container_->GetRealNums(dim_template);
16732+    MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0.");
16733+    real_nums[dim_template] = dim_nums;
16734+    scene_num = std::max(scene_num, dim_nums.size());
16735+  }
16736+  for (size_t i = 0; i < scene_num; ++i) {
16737+    std::vector<int> real_shape(input_shape_.size());
16738+    for (size_t j = 0; j < input_shape_.size(); ++j) {
16739+      if (IsNumber(input_shape_[j])) {
16740+        real_shape[j] = std::stoi(input_shape_[j]);
16741+      } else {
16742+        real_shape[j] = real_nums[input_shape_[j]][i % real_nums[input_shape_[j]].size()];
16743+      }
16744+    }
16745+    int out_plane_size = 1;
16746+    for (int j = 0; j < softmax_struct_.axis_; ++j) {
16747+      MS_CHECK_INT_MUL_NOT_OVERFLOW(out_plane_size, real_shape[j], RET_ERROR);
16748+      out_plane_size *= real_shape[j];
16749+    }
16750+    int in_plane_size = 1;
16751+    for (int j = softmax_struct_.axis_ + 1; j < softmax_struct_.n_dim_; ++j) {
16752+      MS_CHECK_INT_MUL_NOT_OVERFLOW(in_plane_size, real_shape[j], RET_ERROR);
16753+      in_plane_size *= real_shape[j];
16754+    }
16755+    int workspace = out_plane_size * in_plane_size * sizeof(float16);
16756+    buffer_start_ = dynamic_mem_manager_->AllocWorkSpace(workspace, i);
16757+    MS_CHECK_TRUE_MSG(!buffer_start_.empty(), RET_ERROR, "Softmax cannot alloc workspace.");
16758+  }
16759+  return RET_OK;
16760+}
16761+
16762+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Softmax,
16763+                           CPUOpCoderCreator<SoftmaxDynamicFP16Coder>)
16764+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Softmax,
16765+                           CPUOpCoderCreator<SoftmaxDynamicFP16Coder>)
16766+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LogSoftmax,
16767+                           CPUOpCoderCreator<SoftmaxDynamicFP16Coder>)
16768+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LogSoftmax,
16769+                           CPUOpCoderCreator<SoftmaxDynamicFP16Coder>)
16770+}  // namespace mindspore::lite::micro::nnacl
16771diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h
16772new file mode 100644
16773index 00000000..913f5ad4
16774--- /dev/null
16775+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h
16776@@ -0,0 +1,50 @@
16777+/**
16778+ * Copyright 2023 Huawei Technologies Co., Ltd
16779+ *
16780+ * Licensed under the Apache License, Version 2.0 (the "License");
16781+ * you may not use this file except in compliance with the License.
16782+ * You may obtain a copy of the License at
16783+ *
16784+ * http://www.apache.org/licenses/LICENSE-2.0
16785+ *
16786+ * Unless required by applicable law or agreed to in writing, software
16787+ * distributed under the License is distributed on an "AS IS" BASIS,
16788+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16789+ * See the License for the specific language governing permissions and
16790+ * limitations under the License.
16791+ */
16792+
16793+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SOFTMAX_DYNAMIC_FP16_CODER_H_
16794+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SOFTMAX_DYNAMIC_FP16_CODER_H_
16795+
16796+#include <vector>
16797+#include <string>
16798+#include "coder/opcoders/op_coder.h"
16799+#include "coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h"
16800+#include "nnacl/softmax_parameter.h"
16801+#include "nnacl/kernel/softmax.h"
16802+
16803+namespace mindspore::lite::micro::nnacl {
16804+class SoftmaxDynamicFP16Coder final : public OperatorCoder {
16805+ public:
16806+  SoftmaxDynamicFP16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
16807+                          const LiteGraph::Node *node, size_t node_index, Target target)
16808+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
16809+  ~SoftmaxDynamicFP16Coder() override = default;
16810+
16811+  int Prepare(CoderContext *const context) override;
16812+
16813+  int DoCode(CoderContext *const context) override;
16814+
16815+ private:
16816+  int Init();
16817+  int ComputeWorkSpace();
16818+  SoftmaxParameter *param_{nullptr};
16819+  SoftmaxStruct softmax_struct_;
16820+  SoftmaxDynamicParameter dynamic_param_;
16821+  std::vector<std::string> input_shape_;
16822+  std::string buffer_start_;
16823+  std::string sum_data_str_;
16824+};
16825+}  // namespace mindspore::lite::micro::nnacl
16826+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SOFTMAX_DYNAMIC_FP16_CODER_H_
16827diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc
16828new file mode 100644
16829index 00000000..59c8d8b8
16830--- /dev/null
16831+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc
16832@@ -0,0 +1,76 @@
16833+/**
16834+ * Copyright 2023 Huawei Technologies Co., Ltd
16835+ *
16836+ * Licensed under the Apache License, Version 2.0 (the "License");
16837+ * you may not use this file except in compliance with the License.
16838+ * You may obtain a copy of the License at
16839+ *
16840+ * http://www.apache.org/licenses/LICENSE-2.0
16841+ *
16842+ * Unless required by applicable law or agreed to in writing, software
16843+ * distributed under the License is distributed on an "AS IS" BASIS,
16844+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16845+ * See the License for the specific language governing permissions and
16846+ * limitations under the License.
16847+ */
16848+
16849+#include "coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h"
16850+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
16851+#include "coder/opcoders/file_collector.h"
16852+#include "coder/opcoders/parallel.h"
16853+#include "coder/utils/coder_utils.h"
16854+
16855+using mindspore::schema::PrimitiveType_Transpose;
16856+namespace mindspore::lite::micro::nnacl {
16857+int TransposeDynamicFp16Coder::Prepare(CoderContext *const context) {
16858+  MS_CHECK_TRUE_MSG(input_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID,
16859+                    "Input tensor data type is invalid.");
16860+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32, RET_INPUT_PARAM_INVALID,
16861+                    "Perm tensor data type is invalid.");
16862+  MS_CHECK_TRUE_MSG(
16863+    output_tensor_->data_type() == kNumberTypeInt32 || output_tensor_->data_type() == kNumberTypeFloat16,
16864+    RET_INPUT_PARAM_INVALID, "Output tensor data type is invalid.");
16865+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT,
16866+                    "The second input of transpose is non-const.");
16867+  thread_num_ = 1;
16868+  MS_CHECK_RET_CODE(Init(), "init failed");
16869+  return RET_OK;
16870+}
16871+
16872+int TransposeDynamicFp16Coder::DoCode(CoderContext *const context) {
16873+  Collect(context,
16874+          {
16875+            "nnacl/transpose_parameter.h",
16876+            "nnacl/errorcode.h",
16877+            "nnacl/fp16/transpose_fp16.h",
16878+          },
16879+          {
16880+            "transpose_fp16.c",
16881+          });
16882+
16883+  NNaclFp32Serializer code;
16884+  dims_ = static_cast<int>(out_shapes_.size());
16885+  code << "const int32_t output_shape[" << dims_ << "] = {";
16886+  for (size_t i = 0; i < out_shapes_.size(); ++i) {
16887+    code << out_shapes_[i] << ", ";
16888+  }
16889+  code << "};\n";
16890+  code.CodeStruct("trans_param", *param_, dynamic_param_);
16891+  auto input_str = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
16892+  auto output_str = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
16893+  if (param_->num_axes_ > DIMENSION_6D) {
16894+    code.CodeFunction("TransposeDimsFp16", input_str, output_str, "output_shape", "trans_param.perm_",
16895+                      "trans_param.strides_", "trans_param.out_strides_", "trans_param.num_axes_", kDefaultTaskId,
16896+                      kDefaultThreadNum);
16897+  } else {
16898+    code.CodeFunction("DoTransposeFp16", input_str, output_str, "output_shape", "trans_param.perm_",
16899+                      "trans_param.strides_", "trans_param.out_strides_", "trans_param.data_num_",
16900+                      "trans_param.num_axes_");
16901+  }
16902+  context->AppendCode(code.str());
16903+  return RET_OK;
16904+}
16905+
16906+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Transpose,
16907+                           CPUOpCoderCreator<TransposeDynamicFp16Coder>)
16908+}  // namespace mindspore::lite::micro::nnacl
16909diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h
16910new file mode 100644
16911index 00000000..e008a794
16912--- /dev/null
16913+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h
16914@@ -0,0 +1,37 @@
16915+/**
16916+ * Copyright 2023 Huawei Technologies Co., Ltd
16917+ *
16918+ * Licensed under the Apache License, Version 2.0 (the "License");
16919+ * you may not use this file except in compliance with the License.
16920+ * You may obtain a copy of the License at
16921+ *
16922+ * http://www.apache.org/licenses/LICENSE-2.0
16923+ *
16924+ * Unless required by applicable law or agreed to in writing, software
16925+ * distributed under the License is distributed on an "AS IS" BASIS,
16926+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16927+ * See the License for the specific language governing permissions and
16928+ * limitations under the License.
16929+ */
16930+
16931+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_TRANSPOSE_DYNAMIC_FP16_CODER_H_
16932+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_TRANSPOSE_DYNAMIC_FP16_CODER_H_
16933+#include <vector>
16934+#include <string>
16935+#include "coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h"
16936+
16937+namespace mindspore::lite::micro::nnacl {
16938+class TransposeDynamicFp16Coder : public TransposeDynamicFp32Coder {
16939+ public:
16940+  TransposeDynamicFp16Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
16941+                            const LiteGraph::Node *node, size_t node_index, Target target)
16942+      : TransposeDynamicFp32Coder(in_tensors, out_tensors, node, node_index, target) {}
16943+
16944+  ~TransposeDynamicFp16Coder() override = default;
16945+
16946+  int Prepare(CoderContext *const context) override;
16947+
16948+  int DoCode(CoderContext *const context) override;
16949+};
16950+}  // namespace mindspore::lite::micro::nnacl
16951+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_TRANSPOSE_DYNAMIC_FP16_CODER_H_
16952diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc
16953new file mode 100644
16954index 00000000..1dd33bbd
16955--- /dev/null
16956+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc
16957@@ -0,0 +1,112 @@
16958+/**
16959+ * Copyright 2023 Huawei Technologies Co., Ltd
16960+ *
16961+ * Licensed under the Apache License, Version 2.0 (the "License");
16962+ * you may not use this file except in compliance with the License.
16963+ * You may obtain a copy of the License at
16964+ *
16965+ * http://www.apache.org/licenses/LICENSE-2.0
16966+ *
16967+ * Unless required by applicable law or agreed to in writing, software
16968+ * distributed under the License is distributed on an "AS IS" BASIS,
16969+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16970+ * See the License for the specific language governing permissions and
16971+ * limitations under the License.
16972+ */
16973+#include "coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h"
16974+#include <string>
16975+#include "nnacl/fp32/activation_fp32.h"
16976+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
16977+#include "coder/opcoders/file_collector.h"
16978+#include "coder/opcoders/parallel.h"
16979+#include "tools/common/string_util.h"
16980+#include "coder/utils/coder_utils.h"
16981+
16982+using mindspore::schema::PrimitiveType_Activation;
16983+
16984+namespace mindspore::lite::micro::nnacl {
16985+int ActivationDynamicFP32Coder::Preprocess() {
16986+  // attribute
16987+  auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_);
16988+  int64_t const_part = 1;
16989+  std::string non_const_part;
16990+  for (const auto &item : in_shape) {
16991+    if (IsNumber(item)) {
16992+      const_part *= std::atoi(item.c_str());
16993+    } else {
16994+      if (!non_const_part.empty()) {
16995+        non_const_part += " * ";
16996+      }
16997+      non_const_part += item;
16998+    }
16999+  }
17000+  count_ = std::to_string(const_part) + " * " + non_const_part;
17001+  input_data_ = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
17002+  MS_CHECK_TRUE_MSG(!input_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator");
17003+  output_data_ = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
17004+  MS_CHECK_TRUE_MSG(!output_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator");
17005+  return RET_OK;
17006+}
17007+
17008+int ActivationDynamicFP32Coder::DoCode(CoderContext *const context) {
17009+  Collect(context,
17010+          {
17011+            "wrapper/fp32/activation_fp32_wrapper.h",
17012+            "nnacl/fp32/activation_fp32.h",
17013+          },
17014+          {
17015+            "activation_fp32_wrapper.c",
17016+            "activation_fp32.c",
17017+          });
17018+  NNaclFp32Serializer code;
17019+  auto *activation_parameter = reinterpret_cast<ActivationParameter *>(parameter_);
17020+  int ret = Preprocess();
17021+  MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "Preprocess failed");
17022+
17023+  switch (activation_parameter->type_) {
17024+    case schema::ActivationType_RELU:
17025+      code.CodeFunction("Fp32Relu", input_data_, count_, output_data_);
17026+      break;
17027+    case schema::ActivationType_RELU6:
17028+      code.CodeFunction("Fp32Relu6", input_data_, count_, output_data_);
17029+      break;
17030+    case schema::ActivationType_LEAKY_RELU:
17031+      code.CodeFunction("LRelu", input_data_, count_, output_data_, activation_parameter->alpha_);
17032+      break;
17033+    case schema::ActivationType_SIGMOID:
17034+      if (!support_parallel_) {
17035+        code.CodeFunction("Sigmoid", input_data_, count_, output_data_);
17036+      } else {
17037+        code.CodeStruct("activation_param", *activation_parameter);
17038+        code.CodeBaseStruct("ActivationFp32Args", kRunArgs, input_data_, count_, output_data_, 0.0f,
17039+                            "&activation_param");
17040+        code.CodeFunction(kParallelLaunch, "DoSigmoid", kRunArgsAddr, "activation_param.op_parameter_.thread_num_");
17041+      }
17042+      break;
17043+    case schema::ActivationType_TANH:
17044+      code.CodeFunction("Tanh", input_data_, count_, output_data_);
17045+      break;
17046+    case schema::ActivationType_HSWISH:
17047+      code.CodeFunction("HSwish", input_data_, count_, output_data_);
17048+      break;
17049+    case schema::ActivationType_SWISH:
17050+      code.CodeFunction("Swish", input_data_, count_, output_data_);
17051+      break;
17052+    case schema::ActivationType_HSIGMOID:
17053+      code.CodeFunction("HSigmoid", input_data_, count_, output_data_);
17054+      break;
17055+    case schema::ActivationType_ELU:
17056+      code.CodeFunction("Elu", input_data_, count_, output_data_, activation_parameter->alpha_);
17057+      break;
17058+    default:
17059+      MS_LOG(ERROR) << "Activation type error";
17060+      return RET_ERROR;
17061+  }
17062+  MS_LOG(DEBUG) << "ActivationFP32Code has been called";
17063+  context->AppendCode(code.str());
17064+  return lite::RET_OK;
17065+}
17066+
17067+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Activation,
17068+                           CPUOpCoderCreator<ActivationDynamicFP32Coder>)
17069+}  // namespace mindspore::lite::micro::nnacl
17070diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h
17071new file mode 100644
17072index 00000000..1560afbb
17073--- /dev/null
17074+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h
17075@@ -0,0 +1,46 @@
17076+/**
17077+ * Copyright 2023 Huawei Technologies Co., Ltd
17078+ *
17079+ * Licensed under the Apache License, Version 2.0 (the "License");
17080+ * you may not use this file except in compliance with the License.
17081+ * You may obtain a copy of the License at
17082+ *
17083+ * http://www.apache.org/licenses/LICENSE-2.0
17084+ *
17085+ * Unless required by applicable law or agreed to in writing, software
17086+ * distributed under the License is distributed on an "AS IS" BASIS,
17087+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17088+ * See the License for the specific language governing permissions and
17089+ * limitations under the License.
17090+ */
17091+
17092+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_ACTIVATION_DYNAMIC_FP32_CODER_H_
17093+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_ACTIVATION_DYNAMIC_FP32_CODER_H_
17094+
17095+#include <string>
17096+#include <vector>
17097+#include "tools/converter/micro/coder/opcoders/op_coder.h"
17098+#include "tools/converter/micro/coder/shape_info_container.h"
17099+#include "tools/converter/micro/coder/dynamic_mem_manager.h"
17100+
17101+namespace mindspore::lite::micro::nnacl {
17102+class ActivationDynamicFP32Coder : public OperatorCoder {
17103+ public:
17104+  ActivationDynamicFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
17105+                             const LiteGraph::Node *node, size_t node_index, Target target)
17106+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
17107+
17108+  ~ActivationDynamicFP32Coder() override = default;
17109+
17110+  int Prepare(CoderContext *const context) override { return RET_OK; }
17111+
17112+  int DoCode(CoderContext *const context) override;
17113+
17114+ protected:
17115+  int Preprocess();
17116+  std::string count_;
17117+  std::string input_data_;
17118+  std::string output_data_;
17119+};
17120+}  // namespace mindspore::lite::micro::nnacl
17121+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_ACTIVATION_DYNAMIC_FP32_CODER_H_
17122diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
17123index c15d3101..1b827283 100644
17124--- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
17125+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc
17126@@ -266,7 +266,6 @@ void ConvolutionWinogradFP32Coder::CollectFilesForFunc(CoderContext *const conte
17127   } else if (target_ == kARM64) {
17128     Collect(context, {}, {},
17129             {
17130-              "BigMatmulFp32Opt.S",
17131               "MatmulFp32.S",
17132               "MatmulFp32Opt.S",
17133               "PreSum4x16Int8Peroc.S",
17134diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc
17135new file mode 100644
17136index 00000000..57d7a5dd
17137--- /dev/null
17138+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc
17139@@ -0,0 +1,106 @@
17140+/**
17141+ * Copyright 2021-2022 Huawei Technologies Co., Ltd
17142+ *
17143+ * Licensed under the Apache License, Version 2.0 (the "License");
17144+ * you may not use this file except in compliance with the License.
17145+ * You may obtain a copy of the License at
17146+ *
17147+ * http://www.apache.org/licenses/LICENSE-2.0
17148+ *
17149+ * Unless required by applicable law or agreed to in writing, software
17150+ * distributed under the License is distributed on an "AS IS" BASIS,
17151+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17152+ * See the License for the specific language governing permissions and
17153+ * limitations under the License.
17154+ */
17155+
17156+#include "coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h"
17157+#include <string>
17158+#include "nnacl/gather_parameter.h"
17159+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
17160+#include "coder/opcoders/file_collector.h"
17161+#include "coder/utils/coder_utils.h"
17162+#include "tools/common/string_util.h"
17163+
17164+using mindspore::schema::PrimitiveType_Gather;
17165+
17166+namespace mindspore::lite::micro::nnacl {
17167+int GatherDynamicFP32Coder::Prepare(CoderContext *const context) {
17168+  MS_CHECK_TRUE_MSG(input_tensors_.size() == C3NUM, RET_ERROR, "Gather's input-num must be 3.");
17169+  MS_CHECK_TRUE_MSG(input_tensors_[FIRST_INPUT]->IsConst() && input_tensors_[THIRD_INPUT]->IsConst(), RET_NOT_SUPPORT,
17170+                    "Currently, only support the second input of gather is non-const when shape is dynamical.");
17171+  MS_CHECK_TRUE_MSG(input_tensors_[THIRD_INPUT]->data_type() == kNumberTypeInt32 ||
17172+                      input_tensors_[THIRD_INPUT]->data_type() == kNumberTypeInt,
17173+                    RET_ERROR, "The data-type of Gather's third input must be int.");
17174+  auto axis = input_tensors_[THIRD_INPUT]->data();
17175+  MS_CHECK_TRUE_MSG(axis != nullptr, RET_NULL_PTR, "Gather has no axis.");
17176+  axis_ = *(static_cast<int *>(axis));
17177+  auto in_shape0 = input_tensors_[FIRST_INPUT]->shape();
17178+  axis_ = axis_ >= 0 ? axis_ : axis_ + static_cast<int>(in_shape0.size());
17179+  MS_CHECK_TRUE_MSG(axis_ >= 0 && axis_ < static_cast<int>(in_shape0.size()), RET_INPUT_TENSOR_ERROR,
17180+                    "Gather's axis is out of range.");
17181+  return RET_OK;
17182+}
17183+
17184+int GatherDynamicFP32Coder::DoCode(CoderContext *const context) {
17185+  Collect(context,
17186+          {
17187+            "nnacl/base/gather_base.h",
17188+          },
17189+          {
17190+            "gather_base.c",
17191+          });
17192+  auto in_shape0 = input_tensors_[FIRST_INPUT]->shape();
17193+  auto data_item_size = static_cast<int>(lite::DataTypeSize(input_tensors_[FIRST_INPUT]->data_type()));
17194+  int64_t out_size = 1;
17195+  for (size_t i = 0; i < static_cast<size_t>(axis_); ++i) {
17196+    out_size *= in_shape0[i];
17197+  }
17198+  int64_t byte_inner_size = data_item_size;
17199+  for (size_t i = axis_ + 1; i < in_shape0.size(); ++i) {
17200+    byte_inner_size *= in_shape0[i];
17201+  }
17202+  int64_t limit = in_shape0[axis_];
17203+  auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[SECOND_INPUT]);
17204+  int64_t const_part = 1;
17205+  std::string non_const_part;
17206+  for (const auto &item : in_shape1) {
17207+    if (IsNumber(item)) {
17208+      const_part *= std::stoi(item);
17209+    } else {
17210+      if (!non_const_part.empty()) {
17211+        non_const_part += " * ";
17212+      }
17213+      non_const_part += item;
17214+    }
17215+  }
17216+  std::string byte_out_stride_str = std::to_string(const_part * byte_inner_size);
17217+  std::string index_num_str = std::to_string(const_part);
17218+  if (!non_const_part.empty()) {
17219+    byte_out_stride_str += " * " + non_const_part;
17220+    index_num_str += " * " + non_const_part;
17221+  }
17222+  std::string input0_data = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FIRST_INPUT], true);
17223+  MS_CHECK_TRUE_MSG(!input0_data.empty(), RET_ERROR, "pointer is not allocated by the allocator");
17224+  std::string input1_data = dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[SECOND_INPUT]);
17225+  MS_CHECK_TRUE_MSG(!input1_data.empty(), RET_ERROR, "pointer is not allocated by the allocator");
17226+  std::string output_data = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[FIRST_INPUT]);
17227+  MS_CHECK_TRUE_MSG(!output_data.empty(), RET_ERROR, "pointer is not allocated by the allocator");
17228+  NNaclFp32Serializer code;
17229+  code << "\t\tconst int8_t *int8_in = (const int8_t *)(" << input0_data << ");\n";
17230+  code << "\t\tconst int *index_data = (const int *)(" << input1_data << ");\n";
17231+  code << "\t\tint8_t *int8_out = (int8_t *)(" << output_data << ");\n";
17232+  // call the op function
17233+  code.CodeFunction("Gather", "int8_in", out_size, byte_inner_size, limit, "index_data", index_num_str, "int8_out",
17234+                    byte_out_stride_str);
17235+  context->AppendCode(code.str());
17236+  return RET_OK;
17237+}
17238+
17239+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Gather,
17240+                           CPUOpCoderCreator<GatherDynamicFP32Coder>)
17241+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Gather,
17242+                           CPUOpCoderCreator<GatherDynamicFP32Coder>)
17243+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Gather, CPUOpCoderCreator<GatherDynamicFP32Coder>)
17244+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Gather, CPUOpCoderCreator<GatherDynamicFP32Coder>)
17245+}  // namespace mindspore::lite::micro::nnacl
17246diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h
17247new file mode 100644
17248index 00000000..9e58e1fa
17249--- /dev/null
17250+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h
17251@@ -0,0 +1,42 @@
17252+/**
17253+ * Copyright 2021 Huawei Technologies Co., Ltd
17254+ *
17255+ * Licensed under the Apache License, Version 2.0 (the "License");
17256+ * you may not use this file except in compliance with the License.
17257+ * You may obtain a copy of the License at
17258+ *
17259+ * http://www.apache.org/licenses/LICENSE-2.0
17260+ *
17261+ * Unless required by applicable law or agreed to in writing, software
17262+ * distributed under the License is distributed on an "AS IS" BASIS,
17263+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17264+ * See the License for the specific language governing permissions and
17265+ * limitations under the License.
17266+ */
17267+
17268+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_GATHER_DYNAMIC_FP32_CODER_H_
17269+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_GATHER_DYNAMIC_FP32_CODER_H_
17270+
17271+#include <string>
17272+#include <vector>
17273+#include "coder/opcoders/op_coder.h"
17274+#include "nnacl/base/tile_base.h"
17275+
17276+namespace mindspore::lite::micro::nnacl {
17277+class GatherDynamicFP32Coder final : public OperatorCoder {
17278+ public:
17279+  GatherDynamicFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
17280+                         const LiteGraph::Node *node, size_t node_index, Target target)
17281+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
17282+
17283+  ~GatherDynamicFP32Coder() override = default;
17284+
17285+  int Prepare(CoderContext *const context) override;
17286+
17287+  int DoCode(CoderContext *const context) override;
17288+
17289+ private:
17290+  int axis_{0};
17291+};
17292+}  // namespace mindspore::lite::micro::nnacl
17293+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_GATHER_DYNAMIC_FP32_CODER_H_
17294diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc
17295new file mode 100644
17296index 00000000..4ec7f317
17297--- /dev/null
17298+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc
17299@@ -0,0 +1,94 @@
17300+/**
17301+ * Copyright 2022 Huawei Technologies Co., Ltd
17302+ *
17303+ * Licensed under the Apache License, Version 2.0 (the "License");
17304+ * you may not use this file except in compliance with the License.
17305+ * You may obtain a copy of the License at
17306+ *
17307+ * http://www.apache.org/licenses/LICENSE-2.0
17308+ *
17309+ * Unless required by applicable law or agreed to in writing, software
17310+ * distributed under the License is distributed on an "AS IS" BASIS,
17311+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17312+ * See the License for the specific language governing permissions and
17313+ * limitations under the License.
17314+ */
17315+#include "coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h"
17316+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
17317+#include "coder/opcoders/file_collector.h"
17318+#include "coder/opcoders/parallel.h"
17319+#include "coder/utils/coder_utils.h"
17320+#include "nnacl/op_base.h"
17321+
17322+using mindspore::schema::PrimitiveType_Split;
17323+
17324+namespace mindspore::lite::micro::nnacl {
17325+int SplitDynamicFP32Coder::Prepare(CoderContext *const context) {
17326+  auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_);
17327+  int in_shape_size = static_cast<int>(input_shape.size());
17328+  CHECK_LESS_RETURN(in_shape_size, 1);
17329+  CHECK_LESS_RETURN(SPLIT_STRIDES_SIZE - 1, in_shape_size);
17330+  param_ = reinterpret_cast<SplitParameter *>(parameter_);
17331+  CHECK_NULL_RETURN(param_);
17332+
17333+  auto split_dim = param_->split_dim_;
17334+  param_->split_dim_ = split_dim >= 0 ? split_dim : in_shape_size + split_dim;
17335+  std::vector<std::string> strides(in_shape_size);
17336+  strides[in_shape_size - 1] = "1";
17337+  for (int i = static_cast<int>(in_shape_size) - C2NUM; i >= 0; i--) {
17338+    strides[i] = strides[i + 1] + " * " + input_shape[i + 1];
17339+  }
17340+  dynamic_param_.strides_ = "{";
17341+  for (int i = 0; i < in_shape_size; ++i) {
17342+    dynamic_param_.strides_ += strides[i] + ", ";
17343+  }
17344+  dynamic_param_.strides_ += "}";
17345+  CHECK_LESS_RETURN(in_shape_size, param_->split_dim_ + 1);
17346+  if (input_shape.at(param_->split_dim_) == "0") {
17347+    MS_LOG(ERROR) << "input_shape[" << param_->split_dim_ << "] must not be zero!";
17348+    return RET_ERROR;
17349+  }
17350+  CHECK_LESS_RETURN(SPLIT_STRIDES_SIZE, param_->split_dim_ + 1);
17351+  if (strides[param_->split_dim_] == "0") {
17352+    MS_LOG(ERROR) << "strides[" << param_->split_dim_ << "] must not be zero!";
17353+    return RET_ERROR;
17354+  }
17355+  dynamic_param_.split_count_ = strides[0] + " * " + input_shape[0] + " / (" + input_shape.at(param_->split_dim_) +
17356+                                " * " + strides[param_->split_dim_] + ")";
17357+  param_->n_dims_ = static_cast<int>(input_shape.size());
17358+  CHECK_LESS_RETURN(param_->num_split_, 1);
17359+  MS_CHECK_TRUE_MSG(param_->split_sizes_[0] != 0 && param_->split_sizes_[param_->num_split_ - 1] != -1,
17360+                    lite::RET_PARAM_INVALID, "Currently, split not support split_size 0 or -1");
17361+  return RET_OK;
17362+}
17363+
17364+int SplitDynamicFP32Coder::DoCode(CoderContext *const context) {
17365+  Collect(context, {"nnacl/base/split_base.h"}, {"split_base.c"});
17366+  NNaclFp32Serializer code;
17367+  code << "    void *output_ptrs[" << output_tensors_.size() << "] = {";
17368+  for (int i = 0; i < param_->num_split_; i++) {
17369+    code << GetTensorAddr(output_tensors_.at(i), output_tensors_.at(i)->IsConst(), dynamic_mem_manager_, allocator_)
17370+         << ", ";
17371+  }
17372+  code << "};\n";
17373+  auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_);
17374+  code << "    int input_dim[" << input_shape.size() << "] = {";
17375+  for (auto &dim : input_shape) {
17376+    code << dim << ", ";
17377+  }
17378+  code << "};\n";
17379+  std::string input_data = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
17380+  std::string num_unit = dynamic_param_.split_count_ + " * " + std::to_string(param_->num_split_);
17381+  code.CodeStruct("split_param", *param_, dynamic_param_);
17382+  code.CodeFunction("DoSplit", input_data, "output_ptrs", "input_dim", "0", num_unit, "&split_param",
17383+                    lite::DataTypeSize(input_tensor_->data_type()));
17384+  context->AppendCode(code.str());
17385+  return RET_OK;
17386+}
17387+
17388+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Split,
17389+                           CPUOpCoderCreator<SplitDynamicFP32Coder>)
17390+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Split, CPUOpCoderCreator<SplitDynamicFP32Coder>)
17391+REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat16, PrimitiveType_Split,
17392+                           CPUOpCoderCreator<SplitDynamicFP32Coder>)
17393+}  // namespace mindspore::lite::micro::nnacl
17394diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h
17395new file mode 100644
17396index 00000000..e3e64cb3
17397--- /dev/null
17398+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h
17399@@ -0,0 +1,42 @@
17400+/**
17401+ * Copyright 2023 Huawei Technologies Co., Ltd
17402+ *
17403+ * Licensed under the Apache License, Version 2.0 (the "License");
17404+ * you may not use this file except in compliance with the License.
17405+ * You may obtain a copy of the License at
17406+ *
17407+ * http://www.apache.org/licenses/LICENSE-2.0
17408+ *
17409+ * Unless required by applicable law or agreed to in writing, software
17410+ * distributed under the License is distributed on an "AS IS" BASIS,
17411+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17412+ * See the License for the specific language governing permissions and
17413+ * limitations under the License.
17414+ */
17415+
17416+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_SPLIT_DYNAMIC_FP32_CODER_H_
17417+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_SPLIT_DYNAMIC_FP32_CODER_H_
17418+
17419+#include <vector>
17420+#include "coder/opcoders/op_coder.h"
17421+#include "coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h"
17422+#include "nnacl/split_parameter.h"
17423+
17424+namespace mindspore::lite::micro::nnacl {
17425+class SplitDynamicFP32Coder : public OperatorCoder {
17426+ public:
17427+  SplitDynamicFP32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
17428+                        const LiteGraph::Node *node, size_t node_index, Target target)
17429+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
17430+  ~SplitDynamicFP32Coder() override = default;
17431+
17432+  int Prepare(CoderContext *const context) override;
17433+
17434+  int DoCode(CoderContext *const context) override;
17435+
17436+ protected:
17437+  SplitParameter *param_{nullptr};
17438+  SplitDynamicParameter dynamic_param_;
17439+};
17440+}  // namespace mindspore::lite::micro::nnacl
17441+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_SPLIT_DYNAMIC_FP32_CODER_H_
17442diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc
17443new file mode 100644
17444index 00000000..7fb160d5
17445--- /dev/null
17446+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc
17447@@ -0,0 +1,171 @@
17448+/**
17449+ * Copyright 2023 Huawei Technologies Co., Ltd
17450+ *
17451+ * Licensed under the Apache License, Version 2.0 (the "License");
17452+ * you may not use this file except in compliance with the License.
17453+ * You may obtain a copy of the License at
17454+ *
17455+ * http://www.apache.org/licenses/LICENSE-2.0
17456+ *
17457+ * Unless required by applicable law or agreed to in writing, software
17458+ * distributed under the License is distributed on an "AS IS" BASIS,
17459+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17460+ * See the License for the specific language governing permissions and
17461+ * limitations under the License.
17462+ */
17463+
17464+#include "coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h"
17465+#include <vector>
17466+#include <unordered_set>
17467+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
17468+#include "coder/opcoders/file_collector.h"
17469+#include "coder/opcoders/parallel.h"
17470+#include "coder/utils/coder_utils.h"
17471+
17472+using mindspore::schema::PrimitiveType_Transpose;
17473+namespace mindspore::lite::micro::nnacl {
17474+int TransposeDynamicFp32Coder::Prepare(CoderContext *const context) {
17475+  MS_CHECK_TRUE_MSG(input_tensor_->data_type() == kNumberTypeInt32 || input_tensor_->data_type() == kNumberTypeFloat32,
17476+                    RET_INPUT_PARAM_INVALID, "Input tensor data type is invalid.");
17477+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32, RET_INPUT_PARAM_INVALID,
17478+                    "Perm tensor data type is invalid.");
17479+  MS_CHECK_TRUE_MSG(
17480+    output_tensor_->data_type() == kNumberTypeInt32 || output_tensor_->data_type() == kNumberTypeFloat32,
17481+    RET_INPUT_PARAM_INVALID, "Output tensor data type is invalid.");
17482+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT,
17483+                    "The second input of transpose is non-const.");
17484+  thread_num_ = 1;
17485+  MS_CHECK_RET_CODE(Init(), "init failed");
17486+  return RET_OK;
17487+}
17488+
17489+int TransposeDynamicFp32Coder::DoCode(CoderContext *const context) {
17490+  Collect(context,
17491+          {
17492+            "nnacl/transpose_parameter.h",
17493+            "nnacl/errorcode.h",
17494+            "nnacl/fp32/transpose_fp32.h",
17495+          },
17496+          {
17497+            "transpose_fp32.c",
17498+          });
17499+
17500+  NNaclFp32Serializer code;
17501+  dims_ = static_cast<int>(out_shapes_.size());
17502+  code << "const int32_t output_shape[" << dims_ << "] = {";
17503+  for (size_t i = 0; i < out_shapes_.size(); ++i) {
17504+    code << out_shapes_[i] << ", ";
17505+  }
17506+  code << "};\n";
17507+  code.CodeStruct("trans_param", *param_, dynamic_param_);
17508+  auto input_str = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
17509+  auto output_str = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
17510+  if (param_->num_axes_ > DIMENSION_6D) {
17511+    code.CodeFunction("TransposeDimsFp32", input_str, output_str, "output_shape", "trans_param.perm_",
17512+                      "trans_param.strides_", "trans_param.out_strides_", "trans_param.num_axes_", kDefaultTaskId,
17513+                      kDefaultThreadNum);
17514+  } else {
17515+    code.CodeFunction("DoTransposeFp32", input_str, output_str, "output_shape", "trans_param.perm_",
17516+                      "trans_param.strides_", "trans_param.out_strides_", "trans_param.data_num_",
17517+                      "trans_param.num_axes_");
17518+  }
17519+  context->AppendCode(code.str());
17520+  return RET_OK;
17521+}
17522+
17523+int TransposeDynamicFp32Coder::Init() {
17524+  param_ = reinterpret_cast<TransposeParameter *>(parameter_);
17525+  MS_CHECK_PTR(param_);
17526+  param_->num_axes_ = 0;
17527+  if (input_tensors_.size() == C2NUM) {
17528+    param_->num_axes_ = input_tensors_[SECOND_INPUT]->ElementsNum();
17529+  }
17530+  if (input_tensor_->shape().size() != static_cast<size_t>(param_->num_axes_)) {
17531+    return RET_OK;
17532+  }
17533+  // get perm data
17534+  auto ret = ResetStatus();
17535+  if (ret != RET_OK) {
17536+    MS_LOG(ERROR) << "Do transpose reset failed.";
17537+    return ret;
17538+  }
17539+
17540+  ret = ComputeOfflineInfo();
17541+  if (ret != RET_OK) {
17542+    MS_LOG(ERROR) << "Do compute transpose offline info failed.";
17543+    return ret;
17544+  }
17545+  return RET_OK;
17546+}
17547+
17548+int TransposeDynamicFp32Coder::ResetStatus() {
17549+  auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_);
17550+  if (in_shape.size() > MAX_TRANSPOSE_DIM_SIZE) {
17551+    MS_LOG(ERROR) << "input shape out of range.";
17552+    return RET_ERROR;
17553+  }
17554+  int trans_nd[MAX_TRANSPOSE_DIM_SIZE] = {0, 2, 1};
17555+  int *perm_data{nullptr};
17556+  if (in_shape.size() != static_cast<size_t>(param_->num_axes_)) {
17557+    perm_data = trans_nd;
17558+    if (in_shape.size() == C3NUM && param_->num_axes_ == C4NUM) {
17559+      param_->num_axes_ = C3NUM;
17560+    }
17561+    if (param_->num_axes_ == 0) {
17562+      for (int i = 0; i < static_cast<int>(in_shape.size()); ++i) {
17563+        trans_nd[i] = static_cast<int>(in_shape.size()) - 1 - i;
17564+      }
17565+      param_->num_axes_ = static_cast<int>(in_shape.size());
17566+    }
17567+  } else {
17568+    if (input_tensors_.size() != C2NUM) {
17569+      MS_LOG(ERROR) << "input tensors size is not equal to 2.";
17570+      return RET_ERROR;
17571+    }
17572+    auto perm_tensor = input_tensors_.at(SECOND_INPUT);
17573+    perm_data = reinterpret_cast<int *>(perm_tensor->data());
17574+    MSLITE_CHECK_PTR(perm_data);
17575+    std::vector<int> perm(perm_data, perm_data + input_tensors_[SECOND_INPUT]->ElementsNum());
17576+    if (perm.size() != std::unordered_set<int>(perm.cbegin(), perm.cend()).size()) {
17577+      MS_LOG(ERROR) << "Invalid perm, the same element exists in perm.";
17578+      return RET_ERROR;
17579+    }
17580+  }
17581+  MS_CHECK_TRUE_MSG(param_->num_axes_ <= MAX_TRANSPOSE_DIM_SIZE, RET_ERROR, "transpose perm is invalid.");
17582+  for (int i = 0; i < param_->num_axes_; ++i) {
17583+    param_->perm_[i] = perm_data[i];
17584+  }
17585+  return RET_OK;
17586+}
17587+
17588+int TransposeDynamicFp32Coder::ComputeOfflineInfo() {
17589+  in_shapes_ = shape_info_container_->GetTemplateShape(input_tensor_);
17590+  out_shapes_ = shape_info_container_->GetTemplateShape(output_tensor_);
17591+  const int ori_stride = 1;
17592+  dynamic_param_.strides_ = std::to_string(ori_stride) + ", ";
17593+  dynamic_param_.out_strides_ = std::to_string(ori_stride) + ", ";
17594+  dynamic_param_.data_num_ = AccumulateShape(in_shapes_, 0, in_shapes_.size());
17595+  std::vector<std::string> strides(param_->num_axes_);
17596+  std::vector<std::string> out_strides(param_->num_axes_);
17597+  strides[param_->num_axes_ - 1] = "1";
17598+  out_strides[param_->num_axes_ - 1] = "1";
17599+  for (int i = param_->num_axes_ - C2NUM; i >= 0; --i) {
17600+    strides[i] = in_shapes_[i + 1] + " * " + strides[i + 1];
17601+    out_strides[i] = out_shapes_[i + 1] + " * " + out_strides[i + 1];
17602+  }
17603+  dynamic_param_.strides_ = "{";
17604+  dynamic_param_.out_strides_ = "{";
17605+  for (int i = 0; i < param_->num_axes_; ++i) {
17606+    dynamic_param_.strides_ += strides[i] + ", ";
17607+    dynamic_param_.out_strides_ += out_strides[i] + ", ";
17608+  }
17609+  dynamic_param_.strides_ += "}";
17610+  dynamic_param_.out_strides_ += "}";
17611+  return RET_OK;
17612+}
17613+
17614+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat32, PrimitiveType_Transpose,
17615+                           CPUOpCoderCreator<TransposeDynamicFp32Coder>)
17616+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeInt32, PrimitiveType_Transpose,
17617+                           CPUOpCoderCreator<TransposeDynamicFp32Coder>)
17618+}  // namespace mindspore::lite::micro::nnacl
17619diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h
17620new file mode 100644
17621index 00000000..9230b8e3
17622--- /dev/null
17623+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h
17624@@ -0,0 +1,49 @@
17625+/**
17626+ * Copyright 2023 Huawei Technologies Co., Ltd
17627+ *
17628+ * Licensed under the Apache License, Version 2.0 (the "License");
17629+ * you may not use this file except in compliance with the License.
17630+ * You may obtain a copy of the License at
17631+ *
17632+ * http://www.apache.org/licenses/LICENSE-2.0
17633+ *
17634+ * Unless required by applicable law or agreed to in writing, software
17635+ * distributed under the License is distributed on an "AS IS" BASIS,
17636+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17637+ * See the License for the specific language governing permissions and
17638+ * limitations under the License.
17639+ */
17640+
17641+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_TRANSPOSE_DYNAMIC_FP32_CODER_H_
17642+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_TRANSPOSE_DYNAMIC_FP32_CODER_H_
17643+#include <vector>
17644+#include <string>
17645+#include "coder/opcoders/op_coder.h"
17646+#include "nnacl/transpose_parameter.h"
17647+#include "coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h"
17648+
17649+namespace mindspore::lite::micro::nnacl {
17650+class TransposeDynamicFp32Coder : public OperatorCoder {
17651+ public:
17652+  TransposeDynamicFp32Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
17653+                            const LiteGraph::Node *node, size_t node_index, Target target)
17654+      : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {}
17655+
17656+  ~TransposeDynamicFp32Coder() override = default;
17657+
17658+  int Prepare(CoderContext *const context) override;
17659+
17660+  int DoCode(CoderContext *const context) override;
17661+
17662+ protected:
17663+  int Init();
17664+  int ResetStatus();
17665+  int ComputeOfflineInfo();
17666+  TransposeParameter *param_{nullptr};
17667+  TransposeDynamicParameter dynamic_param_;
17668+  int dims_{0};
17669+  std::vector<std::string> in_shapes_;
17670+  std::vector<std::string> out_shapes_;
17671+};
17672+}  // namespace mindspore::lite::micro::nnacl
17673+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_TRANSPOSE_DYNAMIC_FP32_CODER_H_
17674diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h
17675index dffaf14b..fa59e483 100644
17676--- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h
17677+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h
17678@@ -28,6 +28,8 @@
17679 #include "securec/include/securec.h"
17680 #include "tools/converter/micro/coder/opcoders/op_coder_register.h"
17681 #include "tools/converter/micro/coder/log.h"
17682+#include "tools/converter/micro/coder/shape_info_container.h"
17683+#include "tools/converter/micro/coder/dynamic_mem_manager.h"
17684
17685 namespace mindspore::lite::micro {
17686 constexpr int kPrecision = 19;
17687@@ -71,6 +73,8 @@ class OperatorCoder {
17688
17689   void set_parameter(OpParameter *parameter);
17690
17691+  OpParameter *get_parameter() const { return parameter_; }
17692+
17693   const LiteGraph::Node *node() const { return this->node_; }
17694
17695   void AddInitialParameters(Tensor *parameter) { initial_parameters_.push_back(parameter); }
17696@@ -88,6 +92,12 @@ class OperatorCoder {
17697
17698   void set_thread_num(int thread_num);
17699
17700+  void set_shape_info_container(ShapeInfoContainer *shape_info_container) {
17701+    shape_info_container_ = shape_info_container;
17702+  }
17703+
17704+  void set_dynamic_mem_manager(DynamicMemManager *dynamic_mem_manager) { dynamic_mem_manager_ = dynamic_mem_manager; }
17705+
17706  protected:
17707   std::vector<Tensor *> input_tensors_;
17708   std::vector<Tensor *> output_tensors_;
17709@@ -103,6 +113,8 @@ class OperatorCoder {
17710   bool support_parallel_{false};
17711   int thread_num_{1};
17712   int schema_version_ = lite::SCHEMA_VERSION::SCHEMA_CUR;
17713+  ShapeInfoContainer *shape_info_container_{nullptr};
17714+  DynamicMemManager *dynamic_mem_manager_{nullptr};
17715
17716  private:
17717   size_t node_index_{0};
17718diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc
17719index 45b2e37f..e2d70c12 100644
17720--- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc
17721+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc
17722@@ -35,7 +35,7 @@ std::unique_ptr<OperatorCoder> OpCoderBuilder::build(int schema_version) {
17723     }
17724     coder_key = CoderKey(target_, data_type_, schema::PrimitiveType_Custom, custom_type->str());
17725   }
17726-  CoderCreatorFunc creator_func = OpCoderFactory::GetInstance()->FindOpCoder(coder_key);
17727+  CoderCreatorFunc creator_func = OpCoderFactory::GetInstance()->FindOpCoder(coder_key, dynamic_);
17728   if (creator_func == nullptr) {
17729     MS_LOG(ERROR) << "caught unsupported layer: " << node_->name_;
17730     return nullptr;
17731@@ -125,5 +125,10 @@ OpCoderBuilder &OpCoderBuilder::is_builtin_custom(bool builtin_custom) {
17732   return *this;
17733 }
17734
17735+OpCoderBuilder &OpCoderBuilder::is_dynamic(bool dynamic) {
17736+  dynamic_ = dynamic;
17737+  return *this;
17738+}
17739+
17740 void OpCoderBuilder::Reset() {}
17741 }  // namespace mindspore::lite::micro
17742diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h
17743index d85f1c32..bdd815ef 100644
17744--- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h
17745+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h
17746@@ -48,6 +48,8 @@ class OpCoderBuilder {
17747
17748   OpCoderBuilder &is_builtin_custom(bool builtin_custom);
17749
17750+  OpCoderBuilder &is_dynamic(bool dynamic);
17751+
17752   void Reset();
17753
17754  private:
17755@@ -74,6 +76,8 @@ class OpCoderBuilder {
17756   bool support_parallel_{false};
17757
17758   bool builtin_custom_{false};
17759+
17760+  bool dynamic_{false};
17761 };
17762 }  // namespace mindspore::lite::micro
17763 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_OP_CODER_BUILDER_H_
17764diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc
17765index cf26d51d..1dac9c73 100644
17766--- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc
17767+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc
17768@@ -37,33 +37,38 @@ OpCoderFactory *OpCoderFactory::GetInstance() {
17769 }
17770
17771 int OpCoderFactory::RegistOpCoder(Target target, TypeId data_type, schema::PrimitiveType operator_type,
17772-                                  const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func) {
17773+                                  const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func,
17774+                                  bool dynamic) {
17775+  auto &op_sets = dynamic ? dynamic_opcoder_sets_ : static_opcoder_sets_;
17776   // check key
17777   CoderKey key(target, data_type, operator_type, builtin_custom_type);
17778   // insert pair to registry
17779-  if (this->opcoder_sets_.find(key) != this->opcoder_sets_.end()) {
17780+  if (op_sets.find(key) != op_sets.end()) {
17781     MS_LOG(ERROR) << "coder already exist: " << key.ToString();
17782     return RET_ERROR;
17783   }
17784-  this->opcoder_sets_.insert(std::pair<CoderKey, CoderCreatorFunc>(key, creator_func));
17785+  op_sets.insert(std::pair<CoderKey, CoderCreatorFunc>(key, creator_func));
17786   return RET_OK;
17787 }
17788
17789-CoderCreatorFunc OpCoderFactory::FindOpCoder(const CoderKey &key) {
17790-  auto iterator = this->opcoder_sets_.find(key);
17791-  if (iterator != this->opcoder_sets_.end()) {
17792+CoderCreatorFunc OpCoderFactory::FindOpCoder(const CoderKey &key, bool dynamic) {
17793+  const auto &op_sets = dynamic ? dynamic_opcoder_sets_ : static_opcoder_sets_;
17794+  auto iterator = op_sets.find(key);
17795+  if (iterator != op_sets.end()) {
17796     return iterator->second;
17797   }
17798   // matching kAllTargets
17799-  iterator = this->opcoder_sets_.find(key.AllKey());
17800-  if (iterator != this->opcoder_sets_.end()) {
17801+  iterator = op_sets.find(key.AllKey());
17802+  if (iterator != op_sets.end()) {
17803     return iterator->second;
17804   }
17805   return nullptr;
17806 }
17807
17808 OpCoderRegister::OpCoderRegister(Target target, TypeId data_type, schema::PrimitiveType operator_type,
17809-                                 const std::string &builtin_custom_type, const CoderCreatorFunc &creatorFunc) {
17810-  OpCoderFactory::GetInstance()->RegistOpCoder(target, data_type, operator_type, builtin_custom_type, creatorFunc);
17811+                                 const std::string &builtin_custom_type, const CoderCreatorFunc &creatorFunc,
17812+                                 bool dynamic) {
17813+  OpCoderFactory::GetInstance()->RegistOpCoder(target, data_type, operator_type, builtin_custom_type, creatorFunc,
17814+                                               dynamic);
17815 }
17816 }  // namespace mindspore::lite::micro
17817diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h
17818index 30c8a64d..b616e287 100644
17819--- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h
17820+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h
17821@@ -65,15 +65,19 @@ class OpCoderFactory {
17822   static OpCoderFactory *GetInstance();
17823
17824   int RegistOpCoder(Target target, TypeId data_type, schema::PrimitiveType operator_type,
17825-                    const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func);
17826+                    const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func, bool dynamic);
17827
17828-  CoderCreatorFunc FindOpCoder(const CoderKey &key);
17829+  CoderCreatorFunc FindOpCoder(const CoderKey &key, bool dynamic = false);
17830
17831-  ~OpCoderFactory() { opcoder_sets_.clear(); }
17832+  ~OpCoderFactory() {
17833+    static_opcoder_sets_.clear();
17834+    dynamic_opcoder_sets_.clear();
17835+  }
17836
17837  private:
17838   // target || data type || primitive type
17839-  std::map<CoderKey, CoderCreatorFunc> opcoder_sets_;
17840+  std::map<CoderKey, CoderCreatorFunc> static_opcoder_sets_;
17841+  std::map<CoderKey, CoderCreatorFunc> dynamic_opcoder_sets_;
17842 };
17843
17844 class OpCoderRegister {
17845@@ -81,16 +85,20 @@ class OpCoderRegister {
17846   OpCoderRegister() = delete;
17847
17848   OpCoderRegister(Target target, TypeId data_type, schema::PrimitiveType operator_type,
17849-                  const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func);
17850+                  const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func, bool dynamic = false);
17851
17852   ~OpCoderRegister() = default;
17853 };
17854-#define REG_OPERATOR_CODER(target, data_type, operator_type, creator_func)                                   \
17855-  static OpCoderRegister g_##target##data_type##operator_type##Creator(target, data_type, operator_type, "", \
17856-                                                                       creator_func);
17857+#define REG_OPERATOR_CODER(target, data_type, operator_type, creator_func)                                         \
17858+  static OpCoderRegister g_##target##data_type##operator_type##StaticCreator(target, data_type, operator_type, "", \
17859+                                                                             creator_func);
17860
17861 #define REG_BUILIN_CUSTOM_CODER(target, data_type, custom_type, creator_func) \
17862   static OpCoderRegister g_##target##data_type##operator_type##Creator(       \
17863     target, data_type, schema::PrimitiveType_Custom, custom_type, creator_func);
17864+
17865+#define REG_DYNAMIC_OPERATOR_CODER(target, data_type, operator_type, creator_func)                                  \
17866+  static OpCoderRegister g_##target##data_type##operator_type##DynamicCreator(target, data_type, operator_type, "", \
17867+                                                                              creator_func, true);
17868 }  // namespace mindspore::lite::micro
17869 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_OP_CODER_REGISTER_H_
17870diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc
17871index a3743b48..920f2723 100644
17872--- a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc
17873+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc
17874@@ -38,6 +38,15 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const PoolingCompu
17875                         pooling_compute.maxf);
17876 }
17877
17878+void NNaclFp32Serializer::CodeStruct(const std::string &name, const PoolingComputeParam &pooling_compute,
17879+                                     const PoolingDynamicParameter &dynamic_pooling_param) {
17880+  CodeBaseStruct<false>("PoolingComputeParam", name, pooling_compute.input_w_, pooling_compute.input_h_,
17881+                        dynamic_pooling_param.input_batch_, pooling_compute.input_channel_, pooling_compute.output_w_,
17882+                        pooling_compute.output_h_, dynamic_pooling_param.output_batch_, pooling_compute.output_channel_,
17883+                        pooling_compute.window_w_, pooling_compute.window_h_, pooling_compute.minf,
17884+                        pooling_compute.maxf);
17885+}
17886+
17887 void NNaclFp32Serializer::CodeStruct(const std::string &name, const BatchNormParameter &batch_norm_parameter) {
17888   CodeBaseStruct("BatchNormParameter", name, batch_norm_parameter.op_parameter_, batch_norm_parameter.epsilon_,
17889                  batch_norm_parameter.momentum_, batch_norm_parameter.unit_, batch_norm_parameter.units_,
17890@@ -85,6 +94,29 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const ConvParamete
17891     conv_parameter.output_padding_w_, conv_parameter.output_padding_h_);
17892 }
17893
17894+void NNaclFp32Serializer::CodeStruct(const std::string &name, const ConvParameter &conv_parameter,
17895+                                     const ConvDynamicParameter &dynamic_conv_param) {
17896+  CodeBaseStruct<false>(
17897+    "ConvParameter", name, conv_parameter.op_parameter_, "{0}", conv_parameter.kernel_h_, conv_parameter.kernel_w_,
17898+    conv_parameter.stride_h_, conv_parameter.stride_w_, conv_parameter.dilation_h_, conv_parameter.dilation_w_,
17899+    conv_parameter.pad_u_, conv_parameter.pad_d_, conv_parameter.pad_l_, conv_parameter.pad_r_, conv_parameter.group_,
17900+    conv_parameter.tile_num_, dynamic_conv_param.input_batch_, conv_parameter.input_h_, conv_parameter.input_w_,
17901+    conv_parameter.input_channel_, dynamic_conv_param.output_batch_, conv_parameter.output_h_, conv_parameter.output_w_,
17902+    conv_parameter.output_channel_, conv_parameter.thread_num_, conv_parameter.input_unit_, conv_parameter.output_unit_,
17903+    conv_parameter.pad_mode_, conv_parameter.act_type_, conv_parameter.channel_multiplie_,
17904+    conv_parameter.output_padding_w_, conv_parameter.output_padding_h_);
17905+}
17906+
17907+void NNaclFp32Serializer::CodeStruct(const std::string &name, const MatMulParameter &mat_mul_parameter) {
17908+  CodeBaseStruct<false>(
17909+    "MatMulParameter", name, mat_mul_parameter.op_parameter_, mat_mul_parameter.has_bias_, mat_mul_parameter.use_axis_,
17910+    mat_mul_parameter.a_transpose_, mat_mul_parameter.b_transpose_, mat_mul_parameter.act_type_, mat_mul_parameter.row_,
17911+    mat_mul_parameter.col_, mat_mul_parameter.row_4_, mat_mul_parameter.row_16_, mat_mul_parameter.row_align_,
17912+    mat_mul_parameter.col_8_, mat_mul_parameter.col_align_, mat_mul_parameter.deep_, mat_mul_parameter.deep_4_,
17913+    mat_mul_parameter.deep_16_, mat_mul_parameter.deep_align_, mat_mul_parameter.batch, mat_mul_parameter.a_const_,
17914+    mat_mul_parameter.b_const_, mat_mul_parameter.axis_, mat_mul_parameter.matmul_type_);
17915+}
17916+
17917 void NNaclFp32Serializer::CodeStruct(const std::string &name, const MicroMatmulParameter &micro_matmul_parameter) {
17918   CodeBaseStruct<false>("MicroMatmulParameter", name, micro_matmul_parameter.act_type_,
17919                         micro_matmul_parameter.thread_num_, micro_matmul_parameter.row_, micro_matmul_parameter.col_,
17920@@ -102,18 +134,41 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const ScaleStruct
17921                         scale_struct.outer_size_, scale_struct.inner_size_);
17922 }
17923
17924+void NNaclFp32Serializer::CodeStruct(const std::string &name, const ScaleStruct &scale_struct,
17925+                                     const ScaleDynamicParameter &dynamic_scale_param) {
17926+  CodeBaseStruct<false>("ScaleStruct", name, "{}", scale_struct.axis_, scale_struct.data_type_,
17927+                        dynamic_scale_param.axis_size_, dynamic_scale_param.outer_size_,
17928+                        dynamic_scale_param.inner_size_);
17929+}
17930+
17931 void NNaclFp32Serializer::CodeStruct(const std::string &name, const SliceParameter &slice_parameter) {
17932   CodeBaseStruct("SliceParameter", name, slice_parameter.op_parameter_, ToString(slice_parameter.shape_),
17933                  ToString(slice_parameter.begin_), ToString(slice_parameter.end_), ToString(slice_parameter.size_),
17934                  "{0}", slice_parameter.param_length_);
17935 }
17936
17937+void NNaclFp32Serializer::CodeStruct(const std::string &name, const SliceParameter &slice_parameter,
17938+                                     const SliceDynamicParameter &dynamic_slice_param) {
17939+  CodeBaseStruct<false>("SliceParameter", name, slice_parameter.op_parameter_, dynamic_slice_param.shape_,
17940+                        ToString(slice_parameter.begin_), dynamic_slice_param.end_, dynamic_slice_param.size_, "{0}",
17941+                        slice_parameter.param_length_);
17942+}
17943+
17944 void NNaclFp32Serializer::CodeStruct(const std::string &name, const SplitParameter &split_parameter) {
17945   CodeBaseStruct("SplitParameter", name, split_parameter.op_parameter_, split_parameter.num_split_, "split_sizes",
17946                  split_parameter.split_dim_, ToString(split_parameter.strides_), "{0}", split_parameter.n_dims_,
17947                  split_parameter.split_count_);
17948 }
17949
17950+void NNaclFp32Serializer::CodeStruct(const std::string &name, const SplitParameter &split_parameter,
17951+                                     const SplitDynamicParameter &dynamic_split_param) {
17952+  CodeArray("split_sizes", split_parameter.split_sizes_, split_parameter.num_split_, false);
17953+  CodeBaseStruct<false>("SplitParameter", name, split_parameter.op_parameter_, split_parameter.num_split_, nullptr,
17954+                        split_parameter.split_dim_, dynamic_split_param.strides_, "{0}", split_parameter.n_dims_,
17955+                        dynamic_split_param.split_count_);
17956+  code << "    " << name << ".split_sizes_ = split_sizes;\n";
17957+}
17958+
17959 void NNaclFp32Serializer::CodeStruct(const std::string &name, const TileParameter &tile_parameter) {
17960   CodeBaseStruct("TileParameter", name, tile_parameter.op_parameter_, ToString(tile_parameter.multiples_),
17961                  ToString(tile_parameter.in_shape_), ToString(tile_parameter.out_shape_),
17962@@ -127,12 +182,32 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const TransposePar
17963     ToString(transpose_parameter.out_strides_), transpose_parameter.num_axes_, transpose_parameter.data_num_);
17964 }
17965
17966+void NNaclFp32Serializer::CodeStruct(const std::string &name, const TransposeParameter &transpose_param,
17967+                                     const TransposeDynamicParameter &dynamic_transpose_param) {
17968+  CodeBaseStruct<false>("TransposeParameter", name, transpose_param.op_parameter_, ToString(transpose_param.perm_),
17969+                        transpose_param.perm_size_, transpose_param.conjugate_, dynamic_transpose_param.strides_,
17970+                        dynamic_transpose_param.out_strides_, transpose_param.num_axes_,
17971+                        dynamic_transpose_param.data_num_);
17972+}
17973+
17974 void NNaclFp32Serializer::CodeStruct(const std::string &name, const LstmParameter &lstm_parameter) {
17975   CodeBaseStruct("LstmParameter", name, lstm_parameter.op_parameter_, lstm_parameter.input_size_,
17976-                 lstm_parameter.hidden_size_, lstm_parameter.project_size_, lstm_parameter.seq_len_,
17977-                 lstm_parameter.batch_, lstm_parameter.output_step_, lstm_parameter.bidirectional_,
17978-                 lstm_parameter.zoneout_cell_, lstm_parameter.zoneout_hidden_, lstm_parameter.input_row_align_,
17979-                 lstm_parameter.input_col_align_, lstm_parameter.state_row_align_, lstm_parameter.state_col_align_);
17980+                 lstm_parameter.hidden_size_, lstm_parameter.project_size_, lstm_parameter.output_size_,
17981+                 lstm_parameter.seq_len_, lstm_parameter.batch_, lstm_parameter.output_step_,
17982+                 lstm_parameter.bidirectional_, lstm_parameter.zoneout_cell_, lstm_parameter.zoneout_hidden_,
17983+                 lstm_parameter.input_row_align_, lstm_parameter.input_col_align_, lstm_parameter.state_row_align_,
17984+                 lstm_parameter.state_col_align_, lstm_parameter.proj_col_align_, lstm_parameter.has_bias_);
17985+}
17986+
17987+void NNaclFp32Serializer::CodeStruct(const std::string &name, const LstmParameter &lstm_parameter,
17988+                                     const DynamicLstmParameter &dynamic_lstm_param) {
17989+  CodeBaseStruct("LstmParameter", name, lstm_parameter.op_parameter_, lstm_parameter.input_size_,
17990+                 lstm_parameter.hidden_size_, lstm_parameter.project_size_, lstm_parameter.output_size_,
17991+                 dynamic_lstm_param.seq_len_, dynamic_lstm_param.batch_, dynamic_lstm_param.output_step_,
17992+                 lstm_parameter.bidirectional_, lstm_parameter.zoneout_cell_, lstm_parameter.zoneout_hidden_,
17993+                 dynamic_lstm_param.input_row_align_, lstm_parameter.input_col_align_,
17994+                 dynamic_lstm_param.state_row_align_, lstm_parameter.state_col_align_, lstm_parameter.proj_col_align_,
17995+                 lstm_parameter.has_bias_);
17996 }
17997
17998 void NNaclFp32Serializer::CodeStruct(const std::string &name, const DeQuantArg &de_quant_arg) {
17999@@ -165,6 +240,17 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const StridedSlice
18000                  strided_slice_parameter.newAxisMask_, strided_slice_parameter.shrinkAxisMask_);
18001 }
18002
18003+void NNaclFp32Serializer::CodeStruct(const std::string &name, const StridedSliceParameter &strided_slice_parameter,
18004+                                     const StridedSliceDynamicParameter &dynamic_strided_slice_param) {
18005+  CodeBaseStruct<false>("StridedSliceParameter", name, strided_slice_parameter.op_parameter_,
18006+                        ToString(strided_slice_parameter.begins_), ToString(strided_slice_parameter.ends_),
18007+                        ToString(strided_slice_parameter.strides_), strided_slice_parameter.isScale,
18008+                        strided_slice_parameter.in_shape_length_, dynamic_strided_slice_param.in_shape_,
18009+                        strided_slice_parameter.num_axes_, strided_slice_parameter.data_type,
18010+                        strided_slice_parameter.begins_mask_, strided_slice_parameter.ellipsisMask_,
18011+                        strided_slice_parameter.newAxisMask_, strided_slice_parameter.shrinkAxisMask_);
18012+}
18013+
18014 void NNaclFp32Serializer::CodeStruct(const std::string &name, const ArithmeticWrapperInfo &arithmetic_wrapper_info) {
18015   CodeBaseStruct("ArithmeticWrapperInfo", name, arithmetic_wrapper_info.offset0_, arithmetic_wrapper_info.stride0_,
18016                  arithmetic_wrapper_info.offset1_, arithmetic_wrapper_info.stride1_,
18017@@ -207,6 +293,12 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const BroadcastSha
18018                         ToString(param.output_shape_), param.output_shape_size_);
18019 }
18020
18021+void NNaclFp32Serializer::CodeStruct(const std::string &name, const BroadcastShapeInfo &op_param,
18022+                                     const BroadcastDynamicShapeInfo &dynamic_param) {
18023+  CodeBaseStruct<false>("BroadcastShapeInfo", name, dynamic_param.input_shape_, op_param.input_shape_size_,
18024+                        dynamic_param.output_shape_, op_param.output_shape_size_);
18025+}
18026+
18027 void NNaclFp32Serializer::CodeStruct(const std::string &name, const CustomGruParameter &op_param) {
18028   CodeBaseStruct<false>("CustomGruParameter", name, op_param.op_parameter_, op_param.num_step, op_param.batch_size,
18029                         op_param.input_size, op_param.hidden_size);
18030diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h
18031index d1435dea..2b1536c6 100644
18032--- a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h
18033+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h
18034@@ -53,6 +53,15 @@
18035 #include "nnacl/kernel/pooling.h"
18036 #include "nnacl/kernel/layer_norm.h"
18037 #include "nnacl/kernel/fill.h"
18038+#include "coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h"
18039+#include "coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h"
18040+#include "coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h"
18041+#include "coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h"
18042+#include "coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h"
18043+#include "coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h"
18044+#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h"
18045+#include "coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h"
18046+#include "coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h"
18047
18048 namespace mindspore::lite::micro::nnacl {
18049 class NNaclFp32Serializer : public Serializer {
18050@@ -66,6 +75,7 @@ class NNaclFp32Serializer : public Serializer {
18051   void CodeStruct(const std::string &name, const InstanceNormParameter &param);
18052   void CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter);
18053   void CodeStruct(const std::string &name, const ConvParameter &conv_parameter);
18054+  void CodeStruct(const std::string &name, const MatMulParameter &mat_mul_parameter);
18055   void CodeStruct(const std::string &name, const MicroMatmulParameter &micro_matmul_parameter);
18056   void CodeStruct(const std::string &name, const LstmParameter &lstm_parameter);
18057   void CodeStruct(const std::string &name, const ScaleStruct &scale_struct);
18058@@ -89,6 +99,24 @@ class NNaclFp32Serializer : public Serializer {
18059   void CodeStruct(const std::string &name, const SlidingWindowParam &param);
18060   void CodeStruct(const std::string &name, const UnstackParameter &param);
18061   void CodeStruct(const std::string &name, const FillStruct &param);
18062+  void CodeStruct(const std::string &name, const TransposeParameter &transpose_param,
18063+                  const TransposeDynamicParameter &dynamic_transpose_param);
18064+  void CodeStruct(const std::string &name, const SplitParameter &split_parameter,
18065+                  const SplitDynamicParameter &dynamic_split_param);
18066+  void CodeStruct(const std::string &name, const BroadcastShapeInfo &param,
18067+                  const BroadcastDynamicShapeInfo &dynamic_param);
18068+  void CodeStruct(const std::string &name, const LstmParameter &lstm_param,
18069+                  const DynamicLstmParameter &dynamic_lstm_param);
18070+  void CodeStruct(const std::string &name, const SliceParameter &slice_parameter,
18071+                  const SliceDynamicParameter &dynamic_slice_param);
18072+  void CodeStruct(const std::string &name, const StridedSliceParameter &strided_slice_parameter,
18073+                  const StridedSliceDynamicParameter &dynamic_strided_slice_param);
18074+  void CodeStruct(const std::string &name, const ScaleStruct &scale_struct,
18075+                  const ScaleDynamicParameter &dynamic_scale_param);
18076+  void CodeStruct(const std::string &name, const ConvParameter &conv_parameter,
18077+                  const ConvDynamicParameter &dynamic_conv_param);
18078+  void CodeStruct(const std::string &name, const PoolingComputeParam &pooling_compute,
18079+                  const PoolingDynamicParameter &dynamic_pooling_param);
18080   void CodeStruct(const std::string &name, const int *list, int size);
18081   void CodeArrayStruct(const std::string &name, TensorC *tensorC, std::vector<Tensor *> tensor);
18082
18083diff --git a/mindspore/lite/tools/converter/micro/coder/session.cc b/mindspore/lite/tools/converter/micro/coder/session.cc
18084index 55df7a22..374f662d 100644
18085--- a/mindspore/lite/tools/converter/micro/coder/session.cc
18086+++ b/mindspore/lite/tools/converter/micro/coder/session.cc
18087@@ -75,7 +75,10 @@ int CoderSession::PassArgsToContext(const std::string &model_name) {
18088   context_->set_total_buffer_size(final_total_size);
18089   context_->set_graph_inputs(coder_graph_->input_tensors());
18090   context_->set_graph_outputs(coder_graph_->output_tensors());
18091-  if (Configurator::GetInstance()->debug_mode()) {
18092+  context_->set_shape_info_container(&shape_info_container_);
18093+  context_->set_dynamic_mem_manager(&dynamic_mem_manager_);
18094+  Configurator *config = Configurator::GetInstance();
18095+  if (config->debug_mode()) {
18096     std::vector<std::string> blocks;
18097     blocks = AddDumpDataInfo(context_->code_blocks(), op_coders_);
18098     if (blocks.size() == 0) {
18099@@ -100,7 +103,16 @@ int CoderSession::Preprocess() {
18100                                Configurator::GetInstance()->changeable_weights_name());
18101   MS_CHECK_RET_CODE(ret, "assign memory failed");
18102
18103-  // prepare, init model parameters
18104+  if (dynamic_) {
18105+    auto config = Configurator::GetInstance();
18106+    MS_CHECK_TRUE_MSG(config != nullptr, RET_NULL_PTR, "Config is a nullptr.");
18107+    ret = shape_info_container_.Init(op_coders_, graph_inputs_shape_infos_);
18108+    MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Init ShapeInfoContainer failed.");
18109+    auto outputs = coder_graph_->output_tensors();
18110+    ret = dynamic_mem_manager_.AllocDynamicMem(op_coders_, inputs, outputs, &shape_info_container_);
18111+    MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "DynamicMemManager AllocDynamicMem failed.");
18112+  }
18113+  // 2. prepare, init model parameters
18114   for (const auto &op_coder : op_coders_) {
18115     MS_CHECK_PTR(op_coder);
18116     MS_LOG(DEBUG) << "prepare: " << op_coder->name();
18117@@ -133,7 +145,7 @@ int CoderSession::Run(const std::string &model_name) {
18118   ret = PassArgsToContext(model_name);
18119   MS_CHECK_RET_CODE(ret, "PassArgsToContext failed");
18120   MS_LOG(INFO) << "run opcoders success";
18121-  return RET_OK;
18122+  return ret;
18123 }
18124
18125 int CoderSession::GenerateCode() {
18126@@ -161,6 +173,9 @@ int CoderSession::Init(const void *content, int size, const int model_index, boo
18127   context_ = std::make_unique<CoderContext>(model_index);
18128   context_->set_end_flag(end_flag);
18129   enable_fp16_ = enable_fp16;
18130+  Configurator *config = Configurator::GetInstance();
18131+  MS_CHECK_TRUE_MSG(config != nullptr, RET_NULL_PTR, "Config is a nullptr.");
18132+  dynamic_ = !config->graph_inputs_shape_infos().empty();
18133   MS_LOG(INFO) << "CoderSession::Init done";
18134   return RET_OK;
18135 }
18136@@ -227,6 +242,7 @@ int CoderSession::InitTensorsRef() {
18137       }
18138     }
18139     tensor->set_ref_count(refcount);
18140+    tensor->set_init_ref_count(refcount);
18141   }
18142   return RET_OK;
18143 }
18144@@ -325,6 +341,7 @@ int CoderSession::CreateOpCoders() {
18145                                                 .input_indices(input_indices)
18146                                                 .output_indices(output_indices)
18147                                                 .is_builtin_custom(is_built_in_custom_op)
18148+                                                .is_dynamic(dynamic_)
18149                                                 .build(schema_version_);
18150     if (op_coder == nullptr) {
18151       coder_graph_->DumpUnSupportLayer(code_target);
18152@@ -348,6 +365,20 @@ int CoderSession::CompileGraph() {
18153   MS_CHECK_RET_CODE(InitCodeGraph(), "InitGraphInOutTensors failed");
18154   MS_CHECK_RET_CODE(CreateOpCoders(), "CreateOpCoders failed!");
18155   MS_CHECK_RET_CODE(InitTensorsRef(), "InitTensorsRefcount failed!");
18156+  if (dynamic_) {
18157+    Configurator::GetInstance()->set_dynamic_shape(true);
18158+    std::vector<lite::Tensor *> inputs = coder_graph_->input_tensors();
18159+    auto &graph_inputs_shape_infos = Configurator::GetInstance()->graph_inputs_shape_infos();
18160+    MS_CHECK_TRUE_MSG(inputs.size() == graph_inputs_shape_infos.size(), RET_ERROR,
18161+                      "Config graph_inputs_shape's num cannot match.");
18162+    for (size_t i = 0; i < inputs.size(); ++i) {
18163+      graph_inputs_shape_infos_[inputs[i]] = graph_inputs_shape_infos[i];
18164+    }
18165+  }
18166+  for (auto &op_coder : op_coders_) {
18167+    op_coder->set_shape_info_container(&shape_info_container_);
18168+    op_coder->set_dynamic_mem_manager(&dynamic_mem_manager_);
18169+  }
18170   return RET_OK;
18171 }
18172 CoderSession::~CoderSession() { allocator_->Free(); }
18173diff --git a/mindspore/lite/tools/converter/micro/coder/session.h b/mindspore/lite/tools/converter/micro/coder/session.h
18174index 98a8d008..452e3245 100644
18175--- a/mindspore/lite/tools/converter/micro/coder/session.h
18176+++ b/mindspore/lite/tools/converter/micro/coder/session.h
18177@@ -65,6 +65,10 @@ class CoderSession {
18178  private:
18179   int schema_version_ = SCHEMA_VERSION::SCHEMA_CUR;
18180   bool enable_fp16_{false};
18181+  bool dynamic_{false};
18182+  DynamicMemManager dynamic_mem_manager_;
18183+  ShapeInfoContainer shape_info_container_;
18184+  std::map<Tensor *, std::vector<std::vector<int>>> graph_inputs_shape_infos_;
18185 };
18186 }  // namespace mindspore::lite::micro
18187 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SESSION_H_
18188diff --git a/mindspore/lite/tools/converter/micro/coder/shape_info_container.cc b/mindspore/lite/tools/converter/micro/coder/shape_info_container.cc
18189new file mode 100644
18190index 00000000..c914be6c
18191--- /dev/null
18192+++ b/mindspore/lite/tools/converter/micro/coder/shape_info_container.cc
18193@@ -0,0 +1,131 @@
18194+/**
18195+ * Copyright 2023 Huawei Technologies Co., Ltd
18196+ *
18197+ * Licensed under the Apache License, Version 2.0 (the "License");
18198+ * you may not use this file except in compliance with the License.
18199+ * You may obtain a copy of the License at
18200+ *
18201+ * http://www.apache.org/licenses/LICENSE-2.0
18202+ *
18203+ * Unless required by applicable law or agreed to in writing, software
18204+ * distributed under the License is distributed on an "AS IS" BASIS,
18205+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18206+ * See the License for the specific language governing permissions and
18207+ * limitations under the License.
18208+ */
18209+
18210+#include "coder/shape_info_container.h"
18211+#include "src/litert/infer_manager.h"
18212+#include "coder/opcoders/op_coder.h"
18213+#include "coder/utils/coder_utils.h"
18214+#include "tools/common/string_util.h"
18215+
18216+namespace mindspore::lite::micro {
18217+int ShapeInfoContainer::Init(const std::vector<std::unique_ptr<OperatorCoder>> &nodes_coder,
18218+                             const std::map<Tensor *, std::vector<std::vector<int>>> &graph_inputs) {
18219+  MS_CHECK_TRUE_MSG(!graph_inputs.empty(), RET_ERROR, "Cannot get graph_inputs's shape-info");
18220+  auto scene_num = graph_inputs.begin()->second.size();
18221+  for (const auto &item : graph_inputs) {
18222+    MS_CHECK_TRUE_MSG(item.first, RET_NULL_PTR, "Find a nullptr in graph_inputs");
18223+    MS_CHECK_TRUE_MSG(item.second.size() == scene_num, RET_ERROR, "Graph inputs are invalid.");
18224+  }
18225+  var_tensor_shapes_.insert(graph_inputs.begin(), graph_inputs.end());
18226+  for (size_t i = 0; i < scene_num; ++i) {
18227+    for (const auto &item : graph_inputs) {
18228+      item.first->set_shape(item.second[i]);
18229+    }
18230+    for (const auto &node_coder : nodes_coder) {
18231+      auto in_tensors = node_coder->input_tensors();
18232+      auto out_tensors = node_coder->output_tensors();
18233+      auto op_param = node_coder->get_parameter();
18234+      MS_CHECK_TRUE_MSG(op_param, RET_NULL_PTR, "NodeCoder's op_param is a nullptr.");
18235+      auto node = node_coder->node();
18236+      MS_CHECK_TRUE_MSG(node, RET_NULL_PTR, "NodeCoder's node is a nullptr.");
18237+      auto prim = node->primitive_;
18238+      auto ret = DoInferShape(in_tensors, out_tensors, op_param, prim);
18239+      MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "ShapeInfoContainer Init failed.");
18240+    }
18241+  }
18242+  auto ret = DetermineShapeVarInfos();
18243+  MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "DetermineShapeVarInfos failed.");
18244+  return RET_OK;
18245+}
18246+
18247+int ShapeInfoContainer::DoInferShape(const std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_tensors,
18248+                                     OpParameter *op_param, const void *primitive) {
18249+  auto ret = KernelInferShape(in_tensors, out_tensors, primitive, {}, lite::SCHEMA_CUR);
18250+  if (ret == lite::RET_NOT_SUPPORT) {
18251+    ret = KernelInferShape(in_tensors, out_tensors, op_param);
18252+  }
18253+  if (ret != RET_OK) {
18254+    MS_LOG(ERROR) << "Infer shape failed.";
18255+    return ret;
18256+  }
18257+  for (const auto out_tensor : out_tensors) {
18258+    var_tensor_shapes_[out_tensor].push_back(out_tensor->shape());
18259+  }
18260+  return RET_OK;
18261+}
18262+
18263+int ShapeInfoContainer::DetermineShapeVarInfos() {
18264+  MS_CHECK_TRUE_MSG(kShapePrefixName, RET_NULL_PTR, "kShapePrefixName is a nullptr.");
18265+  int index = 0;
18266+  for (const auto &item : var_tensor_shapes_) {
18267+    auto &tensor = item.first;
18268+    auto &shapes = item.second;
18269+    MS_CHECK_TRUE_MSG(!shapes.empty(), RET_ERROR, "Cannot get some tensor's shape.");
18270+    auto shape = shapes.front();
18271+    auto dims = shape.size();
18272+    auto is_same_dim =
18273+      std::all_of(shapes.begin(), shapes.end(), [dims](const std::vector<int> &item) { return item.size() == dims; });
18274+    MS_CHECK_TRUE_MSG(is_same_dim, RET_ERROR, "Tensor's shape-dims-num are not same.");
18275+    std::vector<std::string> shape_symbols;
18276+    for (size_t i = 0; i < dims; ++i) {
18277+      int dim = shape[i];
18278+      std::vector<int> real_nums;
18279+      auto is_same_pos =
18280+        std::all_of(shapes.begin(), shapes.end(), [dim, i](const std::vector<int> &item) { return item[i] == dim; });
18281+      if (is_same_pos) {
18282+        shape_symbols.push_back(std::to_string(dim));
18283+        continue;
18284+      }
18285+      (void)std::transform(shapes.begin(), shapes.end(), std::back_inserter(real_nums),
18286+                           [i](const std::vector<int> &item) { return item[i]; });
18287+      std::string shape_symbol;
18288+      for (const auto &shape_to_num : shape_to_nums_) {
18289+        if (shape_to_num.second == real_nums) {
18290+          shape_symbol = shape_to_num.first;
18291+          break;
18292+        }
18293+      }
18294+      if (shape_symbol.empty()) {
18295+        for (size_t scene_index = 0; scene_index < real_nums.size(); ++scene_index) {
18296+          shapes_whole_scenes_[scene_index].push_back(real_nums[scene_index]);
18297+        }
18298+        shape_symbol = std::string(kShapePrefixName) + "[" + std::to_string(index++) + "]";
18299+        shape_to_nums_[shape_symbol] = real_nums;
18300+      }
18301+      shape_symbols.push_back(shape_symbol);
18302+    }
18303+    shape_templates_[tensor] = shape_symbols;
18304+  }
18305+  return RET_OK;
18306+}
18307+
18308+std::vector<std::string> ShapeInfoContainer::GetTemplateShape(const Tensor *tensor) const {
18309+  if (shape_templates_.find(tensor) == shape_templates_.end()) {
18310+    return {};
18311+  }
18312+  return shape_templates_.at(tensor);
18313+}
18314+
18315+std::vector<int> ShapeInfoContainer::GetRealNums(const std::string &shape_var) const {
18316+  if (IsNumber(shape_var)) {
18317+    return {std::stoi(shape_var)};
18318+  }
18319+  if (shape_to_nums_.find(shape_var) == shape_to_nums_.end()) {
18320+    return {};
18321+  }
18322+  return shape_to_nums_.at(shape_var);
18323+}
18324+}  // namespace mindspore::lite::micro
18325diff --git a/mindspore/lite/tools/converter/micro/coder/shape_info_container.h b/mindspore/lite/tools/converter/micro/coder/shape_info_container.h
18326new file mode 100644
18327index 00000000..9268b249
18328--- /dev/null
18329+++ b/mindspore/lite/tools/converter/micro/coder/shape_info_container.h
18330@@ -0,0 +1,59 @@
18331+/**
18332+ * Copyright 2023 Huawei Technologies Co., Ltd
18333+ *
18334+ * Licensed under the Apache License, Version 2.0 (the "License");
18335+ * you may not use this file except in compliance with the License.
18336+ * You may obtain a copy of the License at
18337+ *
18338+ * http://www.apache.org/licenses/LICENSE-2.0
18339+ *
18340+ * Unless required by applicable law or agreed to in writing, software
18341+ * distributed under the License is distributed on an "AS IS" BASIS,
18342+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18343+ * See the License for the specific language governing permissions and
18344+ * limitations under the License.
18345+ */
18346+
18347+#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SHAPE_INFO_CONTAINER_H_
18348+#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SHAPE_INFO_CONTAINER_H_
18349+
18350+#include <vector>
18351+#include <string>
18352+#include <map>
18353+#include "tools/converter/micro/coder/config.h"
18354+#include "include/model.h"
18355+#include "src/tensor.h"
18356+#include "nnacl/op_base.h"
18357+
18358+namespace mindspore::lite::micro {
18359+class OperatorCoder;
18360+class ShapeInfoContainer {
18361+ public:
18362+  ShapeInfoContainer() = default;
18363+  ~ShapeInfoContainer() = default;
18364+
18365+  int Init(const std::vector<std::unique_ptr<OperatorCoder>> &nodes_coder,
18366+           const std::map<Tensor *, std::vector<std::vector<int>>> &graph_inputs);
18367+
18368+  const std::map<Tensor *, std::vector<std::vector<int>>> &GetVarTensorInfos() const { return var_tensor_shapes_; }
18369+
18370+  std::vector<std::string> GetTemplateShape(const Tensor *tensor) const;
18371+
18372+  const std::map<const Tensor *, std::vector<std::string>> &GetWholeTemplateShape() { return shape_templates_; }
18373+
18374+  std::vector<int> GetRealNums(const std::string &shape_var) const;
18375+
18376+  const std::map<int, std::vector<int>> &GetShapesWholeScenes() const { return shapes_whole_scenes_; }
18377+
18378+ private:
18379+  int DoInferShape(const std::vector<Tensor *> &in_tensors, std::vector<Tensor *> &out_tensors, OpParameter *op_param,
18380+                   const void *primitive);
18381+  int DetermineShapeVarInfos();
18382+  std::map<Tensor *, std::vector<std::vector<int>>> var_tensor_shapes_;
18383+  std::map<const Tensor *, std::vector<std::string>> shape_templates_;
18384+  std::map<std::string, std::vector<int>> shape_to_nums_;
18385+  std::map<int, std::vector<int>> shapes_whole_scenes_;
18386+  Model *model_{nullptr};
18387+};
18388+}  // namespace mindspore::lite::micro
18389+#endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SHAPE_INFO_CONTAINER_H_
18390diff --git a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc
18391index c86a967d..a4c15c83 100644
18392--- a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc
18393+++ b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc
18394@@ -1,5 +1,5 @@
18395 /**
18396- * Copyright 2021-2022 Huawei Technologies Co., Ltd
18397+ * Copyright 2021 Huawei Technologies Co., Ltd
18398  *
18399  * Licensed under the Apache License, Version 2.0 (the "License");
18400  * you may not use this file except in compliance with the License.
18401@@ -22,6 +22,7 @@
18402 #include "tools/converter/micro/coder/log.h"
18403 #include "tools/converter/micro/coder/utils/type_cast.h"
18404 #include "tools/converter/micro/coder/allocator/allocator.h"
18405+#include "tools/common/string_util.h"
18406
18407 namespace mindspore::lite::micro {
18408 bool CheckConstantTensor(const Tensor *const tensor) {
18409@@ -145,4 +146,36 @@ std::vector<std::string> SplitString(std::string str, const std::string &pattern
18410   }
18411   return results;
18412 }
18413+
18414+std::string AccumulateShape(const std::vector<std::string> &shape_template, size_t start_index, size_t end_index) {
18415+  int64_t const_part = 1;
18416+  std::string non_const_part;
18417+  for (size_t i = start_index; i < end_index; ++i) {
18418+    auto item = shape_template[i];
18419+    if (IsNumber(item)) {
18420+      const_part *= std::stoi(item);
18421+    } else {
18422+      if (!non_const_part.empty()) {
18423+        non_const_part += " * ";
18424+      }
18425+      non_const_part += item;
18426+    }
18427+  }
18428+  std::string accumulate_shape = std::to_string(const_part);
18429+  if (!non_const_part.empty()) {
18430+    accumulate_shape += " * " + non_const_part;
18431+  }
18432+  return accumulate_shape;
18433+}
18434+
18435+std::string GetTensorAddr(lite::Tensor *tensor, bool is_const, DynamicMemManager *dynamic_mem_manager,
18436+                          MemoryAllocator *allocator) {
18437+  if (is_const) {
18438+    return allocator->GetRuntimeAddr(tensor, true);
18439+  }
18440+  if (dynamic_mem_manager == nullptr) {
18441+    return allocator->GetRuntimeAddr(tensor);
18442+  }
18443+  return dynamic_mem_manager->GetVarTensorAddr(tensor);
18444+}
18445 }  // namespace mindspore::lite::micro
18446diff --git a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h
18447index eabae70e..70a973cb 100644
18448--- a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h
18449+++ b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h
18450@@ -41,5 +41,10 @@ std::string ArrayToString(std::vector<T> array) {
18451   std::for_each(array.begin(), array.end(), [&result](const T &t) { result += std::to_string(t) + ", "; });
18452   return "{" + result + "}";
18453 }
18454+
18455+std::string AccumulateShape(const std::vector<std::string> &shape_template, size_t start_index, size_t end_index);
18456+
18457+std::string GetTensorAddr(lite::Tensor *tensor, bool is_const, DynamicMemManager *dynamic_mem_manager,
18458+                          MemoryAllocator *allocator);
18459 }  // namespace mindspore::lite::micro
18460 #endif  // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_UTILS_CODER_UTILS_H_
18461diff --git a/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc b/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc
18462index 61b22bae..1d3c02a0 100644
18463--- a/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc
18464+++ b/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc
18465@@ -54,32 +54,30 @@ std::string EnumNameDataType(TypeId type) {
18466 std::string EnumNameMSDataType(TypeId type) {
18467   switch (type) {
18468     case kNumberTypeInt:
18469-      return "kMSDataTypeNumberTypeInt32";
18470+    case kNumberTypeInt32:
18471+      return "OH_AI_DATATYPE_NUMBERTYPE_INT32";
18472     case kNumberTypeInt8:
18473-      return "kMSDataTypeNumberTypeInt8";
18474+      return "OH_AI_DATATYPE_NUMBERTYPE_INT8";
18475     case kNumberTypeInt16:
18476-      return "kMSDataTypeNumberTypeInt16";
18477-    case kNumberTypeInt32:
18478-      return "kMSDataTypeNumberTypeInt32";
18479+      return "OH_AI_DATATYPE_NUMBERTYPE_INT16";
18480     case kNumberTypeInt64:
18481-      return "kMSDataTypeNumberTypeUInt64";
18482+      return "OH_AI_DATATYPE_NUMBERTYPE_INT64";
18483     case kNumberTypeUInt:
18484-      return "kMSDataTypeNumberTypeUInt32";
18485+    case kNumberTypeUInt32:
18486+      return "OH_AI_DATATYPE_NUMBERTYPE_UINT32";
18487     case kNumberTypeUInt8:
18488-      return "kMSDataTypeNumberTypeUInt8";
18489+      return "OH_AI_DATATYPE_NUMBERTYPE_UINT8";
18490     case kNumberTypeUInt16:
18491-      return "kMSDataTypeNumberTypeUInt16";
18492-    case kNumberTypeUInt32:
18493-      return "kMSDataTypeNumberTypeUInt32";
18494+      return "OH_AI_DATATYPE_NUMBERTYPE_UINT16";
18495     case kNumberTypeFloat:
18496     case kNumberTypeFloat32:
18497-      return "kMSDataTypeNumberTypeFloat32";
18498+      return "OH_AI_DATATYPE_NUMBERTYPE_FLOAT32";
18499     case kNumberTypeFloat16:
18500-      return "kMSDataTypeNumberTypeFloat16";
18501+      return "OH_AI_DATATYPE_NUMBERTYPE_FLOAT16";
18502     case kNumberTypeFloat64:
18503-      return "kMSDataTypeNumberTypeFloat64";
18504+      return "OH_AI_DATATYPE_NUMBERTYPE_FLOAT64";
18505     case kTypeUnknown:
18506-      return "kMSDataTypeUnknown";
18507+      return "OH_AI_DATATYPE_UNKNOWN";
18508     default:
18509       return "unsupported";
18510   }
18511diff --git a/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc b/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc
18512index 652db4af..a82feb07 100644
18513--- a/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc
18514+++ b/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc
18515@@ -62,7 +62,7 @@ STATUS ThirdPartyModelParser::InitConfig(const std::string &config_file) {
18516     MS_LOG(ERROR) << "Missing config file in converting third party model";
18517     return RET_ERROR;
18518   }
18519-  auto ret = config_parser.ParseConfigFile(config_file);
18520+  auto ret = config_parser.ParseConfigFile(config_file, nullptr);
18521   if (ret != RET_OK) {
18522     MS_LOG(ERROR) << "Get third party model section from config file failed";
18523     return RET_ERROR;
18524diff --git a/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.cc b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.cc
18525new file mode 100644
18526index 00000000..4caef237
18527--- /dev/null
18528+++ b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.cc
18529@@ -0,0 +1,120 @@
18530+/**
18531+ * Copyright 2023 Huawei Technologies Co., Ltd
18532+ *
18533+ * Licensed under the Apache License, Version 2.0 (the "License");
18534+ * you may not use this file except in compliance with the License.
18535+ * You may obtain a copy of the License at
18536+ *
18537+ * http://www.apache.org/licenses/LICENSE-2.0
18538+ *
18539+ * Unless required by applicable law or agreed to in writing, software
18540+ * distributed under the License is distributed on an "AS IS" BASIS,
18541+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18542+ * See the License for the specific language governing permissions and
18543+ * limitations under the License.
18544+ */
18545+
18546+#define USE_DEPRECATED_API
18547+#include "tools/optimizer/fusion/tile_matmul_fusion.h"
18548+#include <memory>
18549+#include "tools/optimizer/common/gllo_utils.h"
18550+#include "nnacl/op_base.h"
18551+#include "tools/lite_exporter/fetch_content.h"
18552+#include "ops/op_utils.h"
18553+#include "ops/lite_ops.h"
18554+#include "ops/fusion/tile_fusion.h"
18555+#include "ops/fusion/mat_mul_fusion.h"
18556+
18557+namespace mindspore {
18558+namespace opt {
18559+bool TileMatMulFusion::CheckCanFuse(const FuncGraphPtr &func_graph, const AnfNodePtr &node) const {
18560+  auto tile_cnode = node->cast<CNodePtr>();
18561+  MS_CHECK_TRUE_RET(tile_cnode != nullptr, false);
18562+  auto tile_primc = ops::GetOperator<ops::TileFusion>(tile_cnode->input(0));
18563+  MS_CHECK_TRUE_RET(tile_primc != nullptr, false);
18564+  auto tile_prim_c = tile_primc->GetPrim();
18565+  MS_CHECK_TRUE_RET(tile_prim_c != nullptr, false);
18566+  if (IsQuantParameterNode(tile_prim_c)) {
18567+    MS_LOG(INFO) << tile_primc->name() << " is quant node";
18568+    return false;
18569+  }
18570+  auto manager = func_graph->manager();
18571+  MS_CHECK_TRUE_RET(manager != nullptr, false);
18572+  auto node_users = manager->node_users()[tile_cnode];
18573+  for (auto &node_user : node_users) {
18574+    auto post_node = node_user.first;
18575+    auto post_node_index = node_user.second;
18576+    if (!utils::isa<CNode>(post_node) || !CheckPrimitiveType(post_node, prim::kPrimMatMulFusion) ||
18577+        post_node_index != C2NUM) {
18578+      MS_LOG(INFO) << "The post node of tile must be matmul's matirxB.";
18579+      return false;
18580+    }
18581+    auto matmul_primc = ops::GetOperator<ops::MatMulFusion>(GetInputs(post_node).at(0));
18582+    MS_CHECK_TRUE_RET(matmul_primc != nullptr, false);
18583+    auto matmul_prim_c = matmul_primc->GetPrim();
18584+    MS_CHECK_TRUE_RET(matmul_prim_c != nullptr, false);
18585+    if (IsQuantParameterNode(matmul_prim_c)) {
18586+      MS_LOG(INFO) << matmul_prim_c->name() << " is quant node";
18587+      return false;
18588+    }
18589+  }
18590+
18591+  lite::DataInfo data_info;
18592+  auto status = lite::FetchConstData(tile_cnode, C2NUM, converter::kFmkTypeMs, &data_info, false);
18593+  MS_CHECK_TRUE_MSG(status == RET_OK, false, "Fetch tile_cnode third input's const data failed.");
18594+  if ((data_info.data_type_ != kNumberTypeInt32 && data_info.data_type_ != kNumberTypeInt) ||
18595+      data_info.data_.size() / sizeof(int) < DIMENSION_2D) {
18596+    MS_LOG(INFO) << "Tile index data is invalid.";
18597+    return false;
18598+  }
18599+  auto data = reinterpret_cast<int *>(data_info.data_.data());
18600+  int dim = static_cast<int>(data_info.data_.size() / sizeof(int));
18601+  for (int i = dim - C1NUM; i > dim - C3NUM; --i) {
18602+    if (data[i] != C1NUM) {
18603+      return false;
18604+    }
18605+  }
18606+  lite::DataInfo weights_info;
18607+  auto left_pre_node = tile_cnode->input(C1NUM);
18608+  if (left_pre_node->isa<Parameter>() || left_pre_node->isa<ValueNode>()) {
18609+    status = lite::FetchConstData(tile_cnode, C1NUM, converter::kFmkTypeMs, &weights_info, false);
18610+  } else {
18611+    status = lite::FetchDataFromCNode(tile_cnode, C1NUM, &weights_info);
18612+  }
18613+  MS_CHECK_TRUE_RET(status == RET_OK, false);
18614+  MS_CHECK_TRUE_MSG(weights_info.shape_.size() == static_cast<size_t>(dim), false,
18615+                    "Tile_cnode second input's shape size is invalid.");
18616+  for (int i = 0; i < dim - C2NUM; i++) {
18617+    if (data[i] != C1NUM && weights_info.shape_[i] != C1NUM) {
18618+      return false;
18619+    }
18620+  }
18621+  return true;
18622+}
18623+
18624+bool TileMatMulFusion::Run(const FuncGraphPtr &func_graph) {
18625+  MS_CHECK_TRUE_RET(func_graph != nullptr, false);
18626+  auto node_list = TopoSort(func_graph->get_return());
18627+  for (auto &node : node_list) {
18628+    MS_CHECK_TRUE_RET(node != nullptr, false);
18629+    if (!utils::isa<CNode>(node)) {
18630+      continue;
18631+    }
18632+    if (!CheckPrimitiveType(node, prim::kPrimTileFusion)) {
18633+      continue;
18634+    }
18635+    if (!CheckCanFuse(func_graph, node)) {
18636+      continue;
18637+    }
18638+    auto tile_cnode = node->cast<CNodePtr>();
18639+    MS_CHECK_TRUE_RET(tile_cnode != nullptr, false);
18640+    auto left_pre_node = tile_cnode->input(SECOND_INPUT);
18641+    auto manage = func_graph->manager();
18642+    MS_CHECK_TRUE_RET(manage != nullptr, false);
18643+    auto success = manage->Replace(tile_cnode, left_pre_node);
18644+    MS_CHECK_TRUE_MSG(success, false, "Replace old node failed.");
18645+  }
18646+  return true;
18647+}
18648+}  // namespace opt
18649+}  // namespace mindspore
18650diff --git a/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.h b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.h
18651new file mode 100644
18652index 00000000..280dc265
18653--- /dev/null
18654+++ b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.h
18655@@ -0,0 +1,37 @@
18656+/**
18657+ * Copyright 2023 Huawei Technologies Co., Ltd
18658+ *
18659+ * Licensed under the Apache License, Version 2.0 (the "License");
18660+ * you may not use this file except in compliance with the License.
18661+ * You may obtain a copy of the License at
18662+ *
18663+ * http://www.apache.org/licenses/LICENSE-2.0
18664+ *
18665+ * Unless required by applicable law or agreed to in writing, software
18666+ * distributed under the License is distributed on an "AS IS" BASIS,
18667+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18668+ * See the License for the specific language governing permissions and
18669+ * limitations under the License.
18670+ */
18671+
18672+#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_TILE_MATMUL_FUSION_H_
18673+#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_TILE_MATMUL_FUSION_H_
18674+
18675+#include <string>
18676+#include "tools/optimizer/common/multiple_pattern_process_pass.h"
18677+#include "utils/check_convert_utils.h"
18678+
18679+namespace mindspore {
18680+namespace opt {
18681+class TileMatMulFusion : public Pass {
18682+ public:
18683+  TileMatMulFusion() : Pass("TileMatMulFusion") {}
18684+  ~TileMatMulFusion() override = default;
18685+  bool Run(const FuncGraphPtr &func_graph) override;
18686+
18687+ private:
18688+  bool CheckCanFuse(const FuncGraphPtr &func_graph, const AnfNodePtr &node) const;
18689+};
18690+}  // namespace opt
18691+}  // namespace mindspore
18692+#endif  // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_TILE_MATMUL_FUSION_H_
18693diff --git a/mindspore/python/mindspore/ops/operations/_grad_ops.py b/mindspore/python/mindspore/ops/operations/_grad_ops.py
18694index 59c9c883..5714b832 100644
18695--- a/mindspore/python/mindspore/ops/operations/_grad_ops.py
18696+++ b/mindspore/python/mindspore/ops/operations/_grad_ops.py
18697@@ -1521,7 +1521,7 @@ class LSTMGrad(Primitive):
18698     """Computes the data and weight gradients of LSTM."""
18699
18700     @prim_attr_register
18701-    def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
18702+    def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size=0):
18703         self.input_size = validator.check_positive_int(input_size, 'input_size', self.name)
18704         self.hidden_size = validator.check_positive_int(hidden_size, 'hidden_size', self.name)
18705         self.num_layers = validator.check_positive_int(num_layers, 'num_layers', self.name)
18706@@ -1529,12 +1529,53 @@ class LSTMGrad(Primitive):
18707         self.bidirectional = validator.check_value_type('bidirectional', bidirectional, (bool,), self.name)
18708         self.dropout = validator.check_value_type("dropout", dropout, [float], self.name)
18709         self.dropout = validator.check_float_range(dropout, 0, 1, validator.INC_BOTH, 'dropout', self.name)
18710+        self.proj_size = validator.check_int_range(proj_size, 0, hidden_size, Rel.INC_LEFT,
18711+                                                   'proj_size', self.name)
18712+
18713
18714         if bidirectional:
18715             self.num_directions = 2
18716         else:
18717             self.num_directions = 1
18718
18719+    def infer_shape(self, x_shape, hx_shape, cx_shape, w_shape, y_shape, hy_shape, cy_shape, dy_shape, dhy_shape,
18720+                    dcy_shape, reserve_shape):
18721+        # dhy and dcy should be same shape
18722+        validator.check_equal_int(len(dhy_shape), 3, "h_shape", self.name)
18723+        validator.check_equal_int(len(dhy_shape), len(dcy_shape), "h_shape", self.name)
18724+        if self.proj_size == 0:
18725+            validator.check_equal_int(dhy_shape[0], dcy_shape[0], "h_shape[0]", self.name)
18726+            validator.check_equal_int(dhy_shape[1], dcy_shape[1], "h_shape[1]", self.name)
18727+            validator.check_equal_int(dhy_shape[2], dcy_shape[2], "h_shape[2]", self.name)
18728+
18729+        real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
18730+        validator.check_int(dhy_shape[0], self.num_layers * self.num_directions, Rel.EQ, "h_shape[0]", self.name)
18731+        validator.check_equal_int(dhy_shape[2], real_hidden_size, "h_shape[2]", self.name)
18732+
18733+        validator.check_equal_int(len(dy_shape), 3, "dy_shape", self.name)
18734+        validator.check_equal_int(dy_shape[1], dhy_shape[1], "dy[1]", self.name)
18735+        validator.check_int(dy_shape[2], real_hidden_size * self.num_directions, Rel.EQ, "dy[2]", self.name)
18736+
18737+        dx_shape = (y_shape[0], y_shape[1], self.input_size)
18738+        dhx_shape = dhy_shape
18739+        dcx_shape = dcy_shape
18740+        weight_size = 0
18741+        gate_size = 4 * self.hidden_size
18742+        for layer in range(self.num_layers):
18743+            for _ in range(self.num_directions):
18744+                input_layer_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions
18745+                weight_size += gate_size * input_layer_size
18746+                weight_size += gate_size * real_hidden_size
18747+                if self.proj_size > 0:
18748+                    weight_size += self.proj_size * self.hidden_size
18749+                if self.has_bias:
18750+                    weight_size += gate_size
18751+
18752+        return (dx_shape, dhx_shape, dcx_shape, (weight_size, 1, 1))
18753+
    def infer_dtype(self, x_dtype, hx_dtype, cx_dtype, w_dtype, y_dtype, hy_dtype, cy_dtype, dy_dtype, dhy_dtype,
                    dcy_dtype, reserve_dtype):
        """dx/dhx/dcx take dy's dtype; the weight gradient takes hx's dtype."""
        return (dy_dtype, dy_dtype, dy_dtype, hx_dtype)
18757
18758 class DynamicRNNGrad(Primitive):
18759     """Computes the input gradients of DynamicRNN."""
18760diff --git a/mindspore/python/mindspore/ops/operations/nn_ops.py b/mindspore/python/mindspore/ops/operations/nn_ops.py
18761index 3a0eb3d6..8ae747be 100644
18762--- a/mindspore/python/mindspore/ops/operations/nn_ops.py
18763+++ b/mindspore/python/mindspore/ops/operations/nn_ops.py
18764@@ -4356,7 +4356,7 @@ class LSTM(Primitive):
18765     """
18766
18767     @prim_attr_register
18768-    def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
18769+    def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size=0):
18770         """Initialize LSTM."""
18771         self.input_size = validator.check_positive_int(input_size, "input_size", self.name)
18772         self.hidden_size = validator.check_positive_int(hidden_size, "hidden_size", self.name)
18773@@ -4365,12 +4365,40 @@ class LSTM(Primitive):
18774         self.bidirectional = validator.check_value_type("bidirectional", bidirectional, (bool,), self.name)
18775         self.dropout = validator.check_value_type("dropout", dropout, [float], self.name)
18776         self.dropout = validator.check_float_range(dropout, 0, 1, validator.INC_BOTH, 'dropout', self.name)
18777+        self.proj_size = validator.check_int_range(proj_size, 0, hidden_size, validator.INC_LEFT,
18778+                                                   'proj_size', self.name)
18779
18780         if bidirectional:
18781             self.num_directions = 2
18782         else:
18783             self.num_directions = 1
18784
18785+    def infer_shape(self, x_shape, h_shape, c_shape, w_shape):
18786+        validator.check_equal_int(len(x_shape), 3, "x rank", self.name)
18787+        validator.check_equal_int(x_shape[2], self.input_size, "x[2]", self.name)
18788+
18789+        # h and c should be same shape
18790+        validator.check_equal_int(len(h_shape), 3, "h rank", self.name)
18791+        if self.proj_size == 0:
18792+            validator.check("h_shape", h_shape, "c_shape", c_shape, Rel.EQ, self.name)
18793+
18794+        real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
18795+        validator.check_int(h_shape[0], self.num_layers * self.num_directions, Rel.EQ, "h[0]", self.name)
18796+        validator.check_equal_int(h_shape[1], x_shape[1], "h[1]", self.name)
18797+        validator.check_int(h_shape[2], real_hidden_size, Rel.EQ, "h[2]", self.name)
18798+
18799+        y_shape = (x_shape[0], x_shape[1], real_hidden_size * self.num_directions)
18800+
18801+        # set arbitrary shape for reserved space
18802+        reserved_shape = (1, 1)
18803+        state_shape = (1, 1)
18804+        return y_shape, h_shape, c_shape, reserved_shape, state_shape
18805+
    def infer_dtype(self, x_dtype, h_dtype, c_dtype, w_dtype):
        """All inputs must share one valid float dtype; outputs keep x's dtype."""
        args = {'x': x_dtype, 'h': h_dtype, 'c': c_dtype, 'w': w_dtype}
        validator.check_tensors_dtypes_same_and_valid(args, (mstype.float32, mstype.float16), self.name)
        return x_dtype, x_dtype, x_dtype, x_dtype, x_dtype
18810+
18811
18812 class SigmoidCrossEntropyWithLogits(Primitive):
18813     r"""
18814