• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1From e6e35ad9f7b4c0c99d2f9b62c7d199dd3bf487dc Mon Sep 17 00:00:00 2001
2From: Zhu Guodong <zhuguodong0001@163.com>
3Date: Mon, 6 Mar 2023 16:02:57 +0800
4Subject: [PATCH 2/4] generate nnacl simd headers manually
5
6---
7 .../include/nnacl/activation_fp32_simd.h      |  36 +++
8 .../include/nnacl/activation_grad_simd.h      |  36 +++
9 .../nnacl/include/nnacl/adam_fp32_simd.h      |  36 +++
10 .../nnacl/include/nnacl/add_fp32_simd.h       |  36 +++
11 .../include/nnacl/arithmetic_fp32_simd.h      |  36 +++
12 .../include/nnacl/arithmetic_self_fp32_simd.h |  36 +++
13 .../include/nnacl/avx/activation_fp32_avx.h   | 221 +++++++++++++++
14 .../include/nnacl/avx/activation_grad_avx.h   |  57 ++++
15 .../nnacl/include/nnacl/avx/adam_fp32_avx.h   | 210 +++++++++++++++
16 .../nnacl/include/nnacl/avx/add_fp32_avx.h    | 124 +++++++++
17 .../include/nnacl/avx/arithmetic_fp32_avx.h   | 254 ++++++++++++++++++
18 .../nnacl/avx/arithmetic_self_fp32_avx.h      | 129 +++++++++
19 .../include/nnacl/avx/batchnorm_fp32_avx.h    |  67 +++++
20 .../nnacl/avx/bce_with_logits_loss_fp32_avx.h |  69 +++++
21 .../nnacl/include/nnacl/avx/bias_add_avx.h    |  64 +++++
22 .../nnacl/include/nnacl/avx/cast_base_avx.h   |  56 ++++
23 .../nnacl/include/nnacl/avx/cdist_fp32_avx.h  |  70 +++++
24 .../nnacl/include/nnacl/avx/cumsum_fp32_avx.h | 121 +++++++++
25 .../nnacl/include/nnacl/avx/div_fp32_avx.h    | 167 ++++++++++++
26 .../include/nnacl/avx/dropout_fp32_avx.h      |  46 ++++
27 .../nnacl/include/nnacl/avx/exp_fp32_avx.h    |  63 +++++
28 .../nnacl/include/nnacl/avx/fill_base_avx.h   |  53 ++++
29 .../include/nnacl/avx/group_norm_fp32_avx.h   |  77 ++++++
30 .../include/nnacl/avx/layer_norm_fp32_avx.h   |  68 +++++
31 .../nnacl/include/nnacl/avx/matmul_fp32_avx.h |  93 +++++++
32 .../nnacl/include/nnacl/avx/mul_fp32_avx.h    | 218 +++++++++++++++
33 .../include/nnacl/avx/pooling_fp32_avx.h      |  84 ++++++
34 .../nnacl/include/nnacl/avx/power_fp32_avx.h  | 101 +++++++
35 .../nnacl/include/nnacl/avx/reduce_fp32_avx.h | 181 +++++++++++++
36 .../include/nnacl/avx/softmax_fp32_avx.h      |  87 ++++++
37 .../nnacl/include/nnacl/avx/sub_fp32_avx.h    | 167 ++++++++++++
38 .../nnacl/avx512/activation_fp32_avx512.h     | 221 +++++++++++++++
39 .../nnacl/avx512/activation_grad_avx512.h     |  57 ++++
40 .../include/nnacl/avx512/adam_fp32_avx512.h   | 210 +++++++++++++++
41 .../include/nnacl/avx512/add_fp32_avx512.h    | 124 +++++++++
42 .../nnacl/avx512/arithmetic_fp32_avx512.h     | 254 ++++++++++++++++++
43 .../avx512/arithmetic_self_fp32_avx512.h      | 129 +++++++++
44 .../nnacl/avx512/batchnorm_fp32_avx512.h      |  67 +++++
45 .../avx512/bce_with_logits_loss_fp32_avx512.h |  69 +++++
46 .../include/nnacl/avx512/bias_add_avx512.h    |  64 +++++
47 .../include/nnacl/avx512/cast_base_avx512.h   |  56 ++++
48 .../include/nnacl/avx512/cdist_fp32_avx512.h  |  70 +++++
49 .../include/nnacl/avx512/cumsum_fp32_avx512.h | 121 +++++++++
50 .../include/nnacl/avx512/div_fp32_avx512.h    | 167 ++++++++++++
51 .../nnacl/avx512/dropout_fp32_avx512.h        |  46 ++++
52 .../include/nnacl/avx512/exp_fp32_avx512.h    |  63 +++++
53 .../include/nnacl/avx512/fill_base_avx512.h   |  53 ++++
54 .../nnacl/avx512/group_norm_fp32_avx512.h     |  77 ++++++
55 .../nnacl/avx512/layer_norm_fp32_avx512.h     |  68 +++++
56 .../include/nnacl/avx512/matmul_fp32_avx512.h |  93 +++++++
57 .../include/nnacl/avx512/mul_fp32_avx512.h    | 218 +++++++++++++++
58 .../nnacl/avx512/pooling_fp32_avx512.h        |  84 ++++++
59 .../include/nnacl/avx512/power_fp32_avx512.h  | 101 +++++++
60 .../include/nnacl/avx512/reduce_fp32_avx512.h | 181 +++++++++++++
61 .../nnacl/avx512/softmax_fp32_avx512.h        |  87 ++++++
62 .../include/nnacl/avx512/sub_fp32_avx512.h    | 167 ++++++++++++
63 .../nnacl/include/nnacl/batchnorm_fp32_simd.h |  36 +++
64 .../nnacl/bce_with_logits_loss_fp32_simd.h    |  36 +++
65 .../nnacl/include/nnacl/bias_add_simd.h       |  36 +++
66 .../nnacl/include/nnacl/cast_base_simd.h      |  36 +++
67 .../nnacl/include/nnacl/cdist_fp32_simd.h     |  36 +++
68 .../nnacl/include/nnacl/cumsum_fp32_simd.h    |  36 +++
69 .../nnacl/include/nnacl/div_fp32_simd.h       |  36 +++
70 .../nnacl/include/nnacl/dropout_fp32_simd.h   |  36 +++
71 .../nnacl/include/nnacl/exp_fp32_simd.h       |  36 +++
72 .../nnacl/include/nnacl/fill_base_simd.h      |  36 +++
73 .../include/nnacl/group_norm_fp32_simd.h      |  36 +++
74 .../include/nnacl/layer_norm_fp32_simd.h      |  36 +++
75 .../nnacl/include/nnacl/matmul_fp32_simd.h    |  36 +++
76 .../nnacl/include/nnacl/mul_fp32_simd.h       |  36 +++
77 .../include/nnacl/neon/activation_fp32_neon.h | 220 +++++++++++++++
78 .../include/nnacl/neon/activation_grad_neon.h |  56 ++++
79 .../nnacl/include/nnacl/neon/adam_fp32_neon.h | 209 ++++++++++++++
80 .../nnacl/include/nnacl/neon/add_fp32_neon.h  | 123 +++++++++
81 .../include/nnacl/neon/arithmetic_fp32_neon.h | 253 +++++++++++++++++
82 .../nnacl/neon/arithmetic_self_fp32_neon.h    | 128 +++++++++
83 .../include/nnacl/neon/batchnorm_fp32_neon.h  |  66 +++++
84 .../neon/bce_with_logits_loss_fp32_neon.h     |  68 +++++
85 .../nnacl/include/nnacl/neon/bias_add_neon.h  |  63 +++++
86 .../nnacl/include/nnacl/neon/cast_base_neon.h |  55 ++++
87 .../include/nnacl/neon/cdist_fp32_neon.h      |  69 +++++
88 .../include/nnacl/neon/cumsum_fp32_neon.h     | 120 +++++++++
89 .../nnacl/include/nnacl/neon/div_fp32_neon.h  | 166 ++++++++++++
90 .../include/nnacl/neon/dropout_fp32_neon.h    |  45 ++++
91 .../nnacl/include/nnacl/neon/exp_fp32_neon.h  |  62 +++++
92 .../nnacl/include/nnacl/neon/fill_base_neon.h |  52 ++++
93 .../include/nnacl/neon/group_norm_fp32_neon.h |  76 ++++++
94 .../include/nnacl/neon/layer_norm_fp32_neon.h |  67 +++++
95 .../include/nnacl/neon/matmul_fp32_neon.h     |  92 +++++++
96 .../nnacl/include/nnacl/neon/mul_fp32_neon.h  | 217 +++++++++++++++
97 .../include/nnacl/neon/pooling_fp32_neon.h    |  83 ++++++
98 .../include/nnacl/neon/power_fp32_neon.h      | 100 +++++++
99 .../include/nnacl/neon/reduce_fp32_neon.h     | 180 +++++++++++++
100 .../include/nnacl/neon/softmax_fp32_neon.h    |  86 ++++++
101 .../nnacl/include/nnacl/neon/sub_fp32_neon.h  | 166 ++++++++++++
102 .../nnacl/include/nnacl/pooling_fp32_simd.h   |  36 +++
103 .../nnacl/include/nnacl/power_fp32_simd.h     |  36 +++
104 .../nnacl/include/nnacl/reduce_fp32_simd.h    |  36 +++
105 .../nnacl/include/nnacl/softmax_fp32_simd.h   |  36 +++
106 .../include/nnacl/sse/activation_fp32_sse.h   | 221 +++++++++++++++
107 .../include/nnacl/sse/activation_grad_sse.h   |  57 ++++
108 .../nnacl/include/nnacl/sse/adam_fp32_sse.h   | 210 +++++++++++++++
109 .../nnacl/include/nnacl/sse/add_fp32_sse.h    | 124 +++++++++
110 .../include/nnacl/sse/arithmetic_fp32_sse.h   | 254 ++++++++++++++++++
111 .../nnacl/sse/arithmetic_self_fp32_sse.h      | 129 +++++++++
112 .../include/nnacl/sse/batchnorm_fp32_sse.h    |  67 +++++
113 .../nnacl/sse/bce_with_logits_loss_fp32_sse.h |  69 +++++
114 .../nnacl/include/nnacl/sse/bias_add_sse.h    |  64 +++++
115 .../nnacl/include/nnacl/sse/cast_base_sse.h   |  56 ++++
116 .../nnacl/include/nnacl/sse/cdist_fp32_sse.h  |  70 +++++
117 .../nnacl/include/nnacl/sse/cumsum_fp32_sse.h | 121 +++++++++
118 .../nnacl/include/nnacl/sse/div_fp32_sse.h    | 167 ++++++++++++
119 .../include/nnacl/sse/dropout_fp32_sse.h      |  46 ++++
120 .../nnacl/include/nnacl/sse/exp_fp32_sse.h    |  63 +++++
121 .../nnacl/include/nnacl/sse/fill_base_sse.h   |  53 ++++
122 .../include/nnacl/sse/group_norm_fp32_sse.h   |  77 ++++++
123 .../include/nnacl/sse/layer_norm_fp32_sse.h   |  68 +++++
124 .../nnacl/include/nnacl/sse/matmul_fp32_sse.h |  93 +++++++
125 .../nnacl/include/nnacl/sse/mul_fp32_sse.h    | 218 +++++++++++++++
126 .../include/nnacl/sse/pooling_fp32_sse.h      |  84 ++++++
127 .../nnacl/include/nnacl/sse/power_fp32_sse.h  | 101 +++++++
128 .../nnacl/include/nnacl/sse/reduce_fp32_sse.h | 181 +++++++++++++
129 .../include/nnacl/sse/softmax_fp32_sse.h      |  87 ++++++
130 .../nnacl/include/nnacl/sse/sub_fp32_sse.h    | 167 ++++++++++++
131 .../nnacl/include/nnacl/sub_fp32_simd.h       |  36 +++
132 125 files changed, 12263 insertions(+)
133 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
134 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
135 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
136 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
137 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
138 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
139 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
140 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
141 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
142 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
143 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
144 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
145 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
146 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
147 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
148 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
149 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
150 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
151 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
152 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
153 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
154 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
155 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
156 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
157 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
158 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
159 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
160 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
161 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
162 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
163 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
164 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
165 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
166 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
167 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
168 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
169 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
170 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
171 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
172 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
173 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
174 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
175 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
176 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
177 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
178 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
179 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
180 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
181 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
182 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
183 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
184 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
185 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
186 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
187 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
188 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
189 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
190 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
191 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
192 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
193 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
194 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
195 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
196 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
197 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
198 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
199 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
200 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
201 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
202 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
203 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
204 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
205 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
206 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
207 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
208 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
209 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
210 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
211 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
212 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
213 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
214 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
215 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
216 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
217 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
218 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
219 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
220 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
221 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
222 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
223 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
224 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
225 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
226 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
227 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
228 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
229 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
230 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
231 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
232 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
233 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
234 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
235 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
236 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
237 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
238 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
239 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
240 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
241 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
242 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
243 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
244 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
245 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
246 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
247 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
248 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
249 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
250 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
251 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
252 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
253 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
254 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
255 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
256 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
257 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h
258
259diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
260new file mode 100644
261index 00000000..fead4fd3
262--- /dev/null
263+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
264@@ -0,0 +1,36 @@
265+/**
266+ * Copyright 2022 Huawei Technologies Co., Ltd
267+ *
268+ * Licensed under the Apache License, Version 2.0 (the "License");
269+ * you may not use this file except in compliance with the License.
270+ * You may obtain a copy of the License at
271+ *
272+ * http://www.apache.org/licenses/LICENSE-2.0
273+ *
274+ * Unless required by applicable law or agreed to in writing, software
275+ * distributed under the License is distributed on an "AS IS" BASIS,
276+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
277+ * See the License for the specific language governing permissions and
278+ * limitations under the License.
279+ */
280+#ifndef MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_
281+#define MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_
282+
283+#include "nnacl/intrinsics/ms_simd_instructions.h"
284+#ifdef ENABLE_AVX512
285+#include "nnacl/avx512/activation_fp32_avx512.h"
286+#endif
287+
288+#ifdef ENABLE_AVX
289+#include "nnacl/avx/activation_fp32_avx.h"
290+#endif
291+
292+#ifdef ENABLE_SSE
293+#include "nnacl/sse/activation_fp32_sse.h"
294+#endif
295+
296+#ifdef ENABLE_ARM
297+#include "nnacl/neon/activation_fp32_neon.h"
298+#endif
299+
300+#endif
301diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
302new file mode 100644
303index 00000000..c8637379
304--- /dev/null
305+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
306@@ -0,0 +1,36 @@
307+/**
308+ * Copyright 2022 Huawei Technologies Co., Ltd
309+ *
310+ * Licensed under the Apache License, Version 2.0 (the "License");
311+ * you may not use this file except in compliance with the License.
312+ * You may obtain a copy of the License at
313+ *
314+ * http://www.apache.org/licenses/LICENSE-2.0
315+ *
316+ * Unless required by applicable law or agreed to in writing, software
317+ * distributed under the License is distributed on an "AS IS" BASIS,
318+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
319+ * See the License for the specific language governing permissions and
320+ * limitations under the License.
321+ */
322+#ifndef MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_
323+#define MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_
324+
325+#include "nnacl/intrinsics/ms_simd_instructions.h"
326+#ifdef ENABLE_AVX512
327+#include "nnacl/avx512/activation_grad_avx512.h"
328+#endif
329+
330+#ifdef ENABLE_AVX
331+#include "nnacl/avx/activation_grad_avx.h"
332+#endif
333+
334+#ifdef ENABLE_SSE
335+#include "nnacl/sse/activation_grad_sse.h"
336+#endif
337+
338+#ifdef ENABLE_ARM
339+#include "nnacl/neon/activation_grad_neon.h"
340+#endif
341+
342+#endif
343diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
344new file mode 100644
345index 00000000..267799ed
346--- /dev/null
347+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
348@@ -0,0 +1,36 @@
349+/**
350+ * Copyright 2022 Huawei Technologies Co., Ltd
351+ *
352+ * Licensed under the Apache License, Version 2.0 (the "License");
353+ * you may not use this file except in compliance with the License.
354+ * You may obtain a copy of the License at
355+ *
356+ * http://www.apache.org/licenses/LICENSE-2.0
357+ *
358+ * Unless required by applicable law or agreed to in writing, software
359+ * distributed under the License is distributed on an "AS IS" BASIS,
360+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
361+ * See the License for the specific language governing permissions and
362+ * limitations under the License.
363+ */
364+#ifndef MINDSPORE_NNACL_ADAM_FP32_SIMD_H_
365+#define MINDSPORE_NNACL_ADAM_FP32_SIMD_H_
366+
367+#include "nnacl/intrinsics/ms_simd_instructions.h"
368+#ifdef ENABLE_AVX512
369+#include "nnacl/avx512/adam_fp32_avx512.h"
370+#endif
371+
372+#ifdef ENABLE_AVX
373+#include "nnacl/avx/adam_fp32_avx.h"
374+#endif
375+
376+#ifdef ENABLE_SSE
377+#include "nnacl/sse/adam_fp32_sse.h"
378+#endif
379+
380+#ifdef ENABLE_ARM
381+#include "nnacl/neon/adam_fp32_neon.h"
382+#endif
383+
384+#endif
385diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
386new file mode 100644
387index 00000000..83cd76ec
388--- /dev/null
389+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
390@@ -0,0 +1,36 @@
391+/**
392+ * Copyright 2022 Huawei Technologies Co., Ltd
393+ *
394+ * Licensed under the Apache License, Version 2.0 (the "License");
395+ * you may not use this file except in compliance with the License.
396+ * You may obtain a copy of the License at
397+ *
398+ * http://www.apache.org/licenses/LICENSE-2.0
399+ *
400+ * Unless required by applicable law or agreed to in writing, software
401+ * distributed under the License is distributed on an "AS IS" BASIS,
402+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
403+ * See the License for the specific language governing permissions and
404+ * limitations under the License.
405+ */
406+#ifndef MINDSPORE_NNACL_ADD_FP32_SIMD_H_
407+#define MINDSPORE_NNACL_ADD_FP32_SIMD_H_
408+
409+#include "nnacl/intrinsics/ms_simd_instructions.h"
410+#ifdef ENABLE_AVX512
411+#include "nnacl/avx512/add_fp32_avx512.h"
412+#endif
413+
414+#ifdef ENABLE_AVX
415+#include "nnacl/avx/add_fp32_avx.h"
416+#endif
417+
418+#ifdef ENABLE_SSE
419+#include "nnacl/sse/add_fp32_sse.h"
420+#endif
421+
422+#ifdef ENABLE_ARM
423+#include "nnacl/neon/add_fp32_neon.h"
424+#endif
425+
426+#endif
427diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
428new file mode 100644
429index 00000000..898fe882
430--- /dev/null
431+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
432@@ -0,0 +1,36 @@
433+/**
434+ * Copyright 2022 Huawei Technologies Co., Ltd
435+ *
436+ * Licensed under the Apache License, Version 2.0 (the "License");
437+ * you may not use this file except in compliance with the License.
438+ * You may obtain a copy of the License at
439+ *
440+ * http://www.apache.org/licenses/LICENSE-2.0
441+ *
442+ * Unless required by applicable law or agreed to in writing, software
443+ * distributed under the License is distributed on an "AS IS" BASIS,
444+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
445+ * See the License for the specific language governing permissions and
446+ * limitations under the License.
447+ */
448+#ifndef MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_
449+#define MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_
450+
451+#include "nnacl/intrinsics/ms_simd_instructions.h"
452+#ifdef ENABLE_AVX512
453+#include "nnacl/avx512/arithmetic_fp32_avx512.h"
454+#endif
455+
456+#ifdef ENABLE_AVX
457+#include "nnacl/avx/arithmetic_fp32_avx.h"
458+#endif
459+
460+#ifdef ENABLE_SSE
461+#include "nnacl/sse/arithmetic_fp32_sse.h"
462+#endif
463+
464+#ifdef ENABLE_ARM
465+#include "nnacl/neon/arithmetic_fp32_neon.h"
466+#endif
467+
468+#endif
469diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
470new file mode 100644
471index 00000000..676b53ec
472--- /dev/null
473+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
474@@ -0,0 +1,36 @@
475+/**
476+ * Copyright 2022 Huawei Technologies Co., Ltd
477+ *
478+ * Licensed under the Apache License, Version 2.0 (the "License");
479+ * you may not use this file except in compliance with the License.
480+ * You may obtain a copy of the License at
481+ *
482+ * http://www.apache.org/licenses/LICENSE-2.0
483+ *
484+ * Unless required by applicable law or agreed to in writing, software
485+ * distributed under the License is distributed on an "AS IS" BASIS,
486+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
487+ * See the License for the specific language governing permissions and
488+ * limitations under the License.
489+ */
490+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_
491+#define MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_
492+
493+#include "nnacl/intrinsics/ms_simd_instructions.h"
494+#ifdef ENABLE_AVX512
495+#include "nnacl/avx512/arithmetic_self_fp32_avx512.h"
496+#endif
497+
498+#ifdef ENABLE_AVX
499+#include "nnacl/avx/arithmetic_self_fp32_avx.h"
500+#endif
501+
502+#ifdef ENABLE_SSE
503+#include "nnacl/sse/arithmetic_self_fp32_sse.h"
504+#endif
505+
506+#ifdef ENABLE_ARM
507+#include "nnacl/neon/arithmetic_self_fp32_neon.h"
508+#endif
509+
510+#endif
511diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
512new file mode 100644
513index 00000000..49edf7ec
514--- /dev/null
515+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
516@@ -0,0 +1,221 @@
517+/**
518+ * Copyright 2022 Huawei Technologies Co., Ltd
519+ *
520+ * Licensed under the Apache License, Version 2.0 (the "License");
521+ * you may not use this file except in compliance with the License.
522+ * You may obtain a copy of the License at
523+ *
524+ * http://www.apache.org/licenses/LICENSE-2.0
525+ *
526+ * Unless required by applicable law or agreed to in writing, software
527+ * distributed under the License is distributed on an "AS IS" BASIS,
528+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
529+ * See the License for the specific language governing permissions and
530+ * limitations under the License.
531+ */
532+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_
533+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_
534+
535+#include "nnacl/intrinsics/ms_simd_instructions.h"
536+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
537+
538+#ifdef __cplusplus
539+extern "C" {
540+#endif
541+#pragma GCC push_options
542+#pragma GCC target("avx", "avx2")
543+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
544+#define BLOCK_NUM 8
545+#define MS_SIMD_AVX
546+
547+static inline int Fp32ReluAVX(int index, const float *src, int length, float *dst) {
548+    SIMD_F32 zero = SIMD_SET0_F32;
549+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
550+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
551+    }
552+    return index;
553+}
554+
555+static inline int Int32ReluAVX(int index, const int32_t *src, int length, int32_t *dst) {
556+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f);
557+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
558+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
559+    }
560+    return index;
561+}
562+
563+static inline int Fp32Relu6AVX(int index, const float *src, int length, float *dst) {
564+    SIMD_F32 zero = SIMD_SET0_F32;
565+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
566+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
567+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
568+    }
569+    return index;
570+}
571+
572+static inline int LReluAVX(int index, const float *src, int length, float *dst, float alpha) {
573+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
574+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
575+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
576+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
577+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
578+    }
579+    return index;
580+}
581+
582+static inline int SigmoidAVX(int index, const float *src, int length, float *dst) {
583+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
584+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
585+        SIMD_ST_F32(dst + index,
586+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
587+    }
588+    return index;
589+}
590+
591+static inline int TanhAVX(int index, const float *src, int length, float *dst) {
592+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
593+        SIMD_F32 input = SIMD_LD_F32(src + index);
594+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
595+    }
596+    return index;
597+}
598+
599+static inline int SwishAVX(int index, const float *src, int length, float *dst) {
600+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
601+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
602+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
603+        SIMD_ST_F32(dst + index,
604+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
605+    }
606+    return index;
607+}
608+
609+static inline int HSwishAVX(int index, const float *src, int length, float *dst) {
610+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
611+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
612+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
613+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
614+    }
615+    return index;
616+}
617+
618+static inline int HSigmoidAVX(int index, const float *src, int length, float *dst) {
619+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
620+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
621+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
622+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
623+    }
624+    return index;
625+}
626+
627+static inline int HardTanhNoLimitMinAVX(int index, const float *src, int length, float *dst, float min_val,
628+                                            float max_val) {
629+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
630+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
631+    }
632+    return index;
633+}
634+
635+static inline int HardTanhNoLimitMaxAVX(int index, const float *src, int length, float *dst, float min_val,
636+                                            float max_val) {
637+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
638+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
639+    }
640+    return index;
641+}
642+
643+static inline int HardTanhLimitMinMaxAVX(int index, const float *src, int length, float *dst, float min_val,
644+                                             float max_val) {
645+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
646+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
647+    }
648+    return index;
649+}
650+
651+static inline int GeluApproximateAVX(int index, const float *src, int length, float *dst) {
652+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
653+        SIMD_F32 in = SIMD_LD_F32(src + index);
654+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
655+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
656+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
657+    }
658+    return index;
659+}
660+
661+static inline int GeluAVX(int index, const float *src, int length, float *dst) {
662+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
663+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
664+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
665+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
666+      SIMD_F32 in = SIMD_LD_F32(src + index);
667+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
668+      SIMD_ST_F32(dst + index, res);
669+    }
670+    return index;
671+}
672+
673+static inline int EluAVX(int index, const float *src, int length, float *dst, float alpha) {
674+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
675+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
676+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
677+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
678+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
679+    }
680+    return index;
681+}
682+
683+static inline int CeluAVX(int index, const float *src, int length, float *dst, float alpha) {
684+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
685+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
686+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
687+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
688+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
689+    }
690+    return index;
691+}
692+
693+static inline int HShrinkAVX(int index, const float *src, int length, float *dst, float lambd) {
694+    const float neg_lambd = -1 * lambd;
695+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
696+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
697+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
698+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
699+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
700+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
701+    }
702+    return index;
703+}
704+
705+static inline int SoftShrinkAVX(int index, const float *src, int length, float *dst, float lambd) {
706+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
707+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
708+
709+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
710+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
711+        /* v0 = (in > lamdb) & (in - lamdb) */
712+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
713+        /* v1 = (in < -lamdb) & (in + lamdb) */
714+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
715+        /* out = (v0 | v1) */
716+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
717+    }
718+    return index;
719+}
720+
721+static inline int SoftsignFp32OptAVX(int index, const float *src, int length, float *dst) {
722+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
723+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
724+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
725+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
726+    }
727+    return index;
728+}
729+
730+#undef MS_SIMD_INSTRUCTION
731+#undef BLOCK_NUM
732+#pragma GCC pop_options
733+#undef MS_SIMD_AVX
734+#ifdef __cplusplus
735+}
736+#endif
737+#endif
738diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
739new file mode 100644
740index 00000000..435d24c5
741--- /dev/null
742+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
743@@ -0,0 +1,57 @@
744+/**
745+ * Copyright 2022 Huawei Technologies Co., Ltd
746+ *
747+ * Licensed under the Apache License, Version 2.0 (the "License");
748+ * you may not use this file except in compliance with the License.
749+ * You may obtain a copy of the License at
750+ *
751+ * http://www.apache.org/licenses/LICENSE-2.0
752+ *
753+ * Unless required by applicable law or agreed to in writing, software
754+ * distributed under the License is distributed on an "AS IS" BASIS,
755+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
756+ * See the License for the specific language governing permissions and
757+ * limitations under the License.
758+ */
759+
760+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_
761+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_
762+
763+#include "nnacl/intrinsics/ms_simd_instructions.h"
764+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
765+
766+#ifdef __cplusplus
767+extern "C" {
768+#endif
769+#pragma GCC push_options
770+#pragma GCC target("avx", "avx2")
771+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
772+#define BLOCK_NUM 8
773+#define MS_SIMD_AVX
774+
775+static inline int ShrinkGradAVX(int index, const float *src0, const float *src1,
776+                                               int length, float *dst, float lambd) {
777+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
778+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
779+
780+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
781+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
782+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
783+
784+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
785+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
786+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
787+
788+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
789+    }
790+    return index;
791+}
792+
793+#undef MS_SIMD_INSTRUCTION
794+#undef BLOCK_NUM
795+#pragma GCC pop_options
796+#undef MS_SIMD_AVX
797+#ifdef __cplusplus
798+}
799+#endif
800+#endif
801diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
802new file mode 100644
803index 00000000..54743d80
804--- /dev/null
805+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
806@@ -0,0 +1,210 @@
807+/**
808+ * Copyright 2022 Huawei Technologies Co., Ltd
809+ *
810+ * Licensed under the Apache License, Version 2.0 (the "License");
811+ * you may not use this file except in compliance with the License.
812+ * You may obtain a copy of the License at
813+ *
814+ * http://www.apache.org/licenses/LICENSE-2.0
815+ *
816+ * Unless required by applicable law or agreed to in writing, software
817+ * distributed under the License is distributed on an "AS IS" BASIS,
818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
819+ * See the License for the specific language governing permissions and
820+ * limitations under the License.
821+ */
822+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_
823+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_
824+
825+#include "nnacl/intrinsics/ms_simd_instructions.h"
826+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
827+
828+#ifdef __cplusplus
829+extern "C" {
830+#endif
831+#pragma GCC push_options
832+#pragma GCC target("avx", "avx2")
833+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
834+#define BLOCK_NUM 8
835+#define MS_SIMD_AVX
// NOTE(review): this guard was `#ifdef MS_SIMD_AVX512`, but this AVX header defines
// MS_SIMD_AVX (and never MS_SIMD_AVX512), so every Adam kernel below was silently
// compiled out. Changed to MS_SIMD_AVX — confirm against the SIMD header generator.
#ifdef MS_SIMD_AVX
/**
 * One AdamWeightDecay step over an fp32 slice, BLOCK_NUM lanes per iteration:
 *   m   = beta1 * m + (1 - beta1) * g
 *   v   = beta2 * v + (1 - beta2) * g * g
 *   var = var - lr * (m / (sqrt(v) + epsilon) + decay * var)
 * Returns the first index not processed so the caller can finish the scalar tail.
 */
static inline size_t AdamWeightDecayFp32AVX(size_t index, float *var, float *m, float *v, float lr, float beta1,
                                            float beta2, float epsilon, float decay, const float *gradient,
                                            size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);

  // `index + BLOCK_NUM <= end` instead of `index < end - BLOCK_NUM + 1`: the latter
  // underflows (size_t) when end < BLOCK_NUM and would run far out of bounds.
  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {
    SIMD_F32 var_r = SIMD_LD_F32(var + index);
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);

    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
    SIMD_ST_F32(var + index, var_r);
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp32 var/m/v, fp16 (int16 storage) gradient.
 * The gradient is scaled by global_norm_reciprocal before the update.
 * Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp32Fp16AVX(size_t index, float *var, const int16_t *gradient16, float *m, float *v,
                                              float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    SIMD_F32 var_r = SIMD_LD_F32(var + index);
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));

    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(var + index, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp32 var/m/v, fp32 gradient scaled by
 * global_norm_reciprocal. Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp32Fp32AVX(size_t index, float *var, const float *gradient32, float *m, float *v,
                                              float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    SIMD_F32 var_r = SIMD_LD_F32(var + index);
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);

    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(var + index, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp16 (int16 storage) var and gradient, fp32 m/v.
 * var is widened to fp32 for the update and narrowed back on store.
 * Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp16Fp16AVX(size_t index, int16_t *var16, const int16_t *gradient16, float *m,
                                              float *v, float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    // Fixed: was SIMD_LD_HALF_EPI32(var16) — missing `+ index`, so every iteration
    // reloaded element 0 instead of the current block.
    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp16 (int16 storage) var, fp32 gradient, fp32 m/v.
 * var is widened to fp32 for the update and narrowed back on store.
 * Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp16Fp32AVX(size_t index, int16_t *var16, const float *gradient32, float *m,
                                              float *v, float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    // Fixed: was SIMD_LD_HALF_EPI32(var16) — missing `+ index` (see Fp16Fp16 variant).
    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
  }

  return index;
}
#endif
1008+
1009+#undef MS_SIMD_INSTRUCTION
1010+#undef BLOCK_NUM
1011+#pragma GCC pop_options
1012+#undef MS_SIMD_AVX
1013+#ifdef __cplusplus
1014+}
1015+#endif
1016+#endif
1017diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
1018new file mode 100644
1019index 00000000..716c25b1
1020--- /dev/null
1021+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
1022@@ -0,0 +1,124 @@
1023+/**
1024+ * Copyright 2022 Huawei Technologies Co., Ltd
1025+ *
1026+ * Licensed under the Apache License, Version 2.0 (the "License");
1027+ * you may not use this file except in compliance with the License.
1028+ * You may obtain a copy of the License at
1029+ *
1030+ * http://www.apache.org/licenses/LICENSE-2.0
1031+ *
1032+ * Unless required by applicable law or agreed to in writing, software
1033+ * distributed under the License is distributed on an "AS IS" BASIS,
1034+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1035+ * See the License for the specific language governing permissions and
1036+ * limitations under the License.
1037+ */
1038+
1039+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX_H_
1040+#define MINDSPORE_NNACL_FP32_ADD_AVX_H_
1041+
1042+#include "nnacl/intrinsics/ms_simd_instructions.h"
1043+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1044+
1045+#ifdef __cplusplus
1046+extern "C" {
1047+#endif
1048+#pragma GCC push_options
1049+#pragma GCC target("avx", "avx2")
1050+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1051+#define BLOCK_NUM 8
1052+#define MS_SIMD_AVX
1053+
1054+static inline int ElementOptAddAVX(int index, const float *in0, const float *in1, float *out, int size) {
1055+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
1056+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1057+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1058+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
1059+    SIMD_ST_F32(out + index, vout);
1060+  }
1061+  return index;
1062+}
1063+
1064+static inline int ElementOptAddIntAVX(int index, const int *in0, const int *in1, int *out,
1065+                                                     int size) {
1066+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
1067+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1068+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
1069+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
1070+    SIMD_ST_EPI32(out + index, vout);
1071+  }
1072+  return index;
1073+}
1074+
1075+static inline int ElementOptAddReluAVX(int index, const float *in0, const float *in1, float *out,
1076+                                                      int size) {
1077+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
1078+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1079+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1080+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
1081+    SIMD_ST_F32(out + index, vout);
1082+  }
1083+  return index;
1084+}
1085+
1086+static inline int ElementOptAddRelu6AVX(int index, const float *in0, const float *in1, float *out,
1087+                                                       int size) {
1088+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
1089+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1090+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1091+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
1092+    SIMD_ST_F32(out + index, vout);
1093+  }
1094+  return index;
1095+}
1096+
1097+static inline int ElementAddAVX(int index, const float *in0, const float *in1, float *out, int size) {
1098+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1099+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
1100+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1101+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
1102+    SIMD_ST_F32(out + index, vout);
1103+  }
1104+  return index;
1105+}
1106+
1107+static inline int ElementAddReluAVX(int index, const float *in0, const float *in1, float *out,
1108+                                                   int size) {
1109+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1110+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
1111+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1112+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
1113+    SIMD_ST_F32(out + index, vout);
1114+  }
1115+  return index;
1116+}
1117+
1118+static inline int ElementAddRelu6AVX(int index, const float *in0, const float *in1, float *out,
1119+                                                    int size) {
1120+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1121+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
1122+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1123+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
1124+    SIMD_ST_F32(out + index, vout);
1125+  }
1126+  return index;
1127+}
1128+
1129+static inline int ElementAddIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1130+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1131+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
1132+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
1133+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
1134+    SIMD_ST_EPI32(out + index, vout);
1135+  }
1136+  return index;
1137+}
1138+
1139+#undef MS_SIMD_INSTRUCTION
1140+#undef BLOCK_NUM
1141+#pragma GCC pop_options
1142+#undef MS_SIMD_AVX
1143+#ifdef __cplusplus
1144+}
1145+#endif
1146+#endif
1147diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
1148new file mode 100644
1149index 00000000..9dd24100
1150--- /dev/null
1151+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
1152@@ -0,0 +1,254 @@
1153+/**
1154+ * Copyright 2022 Huawei Technologies Co., Ltd
1155+ *
1156+ * Licensed under the Apache License, Version 2.0 (the "License");
1157+ * you may not use this file except in compliance with the License.
1158+ * You may obtain a copy of the License at
1159+ *
1160+ * http://www.apache.org/licenses/LICENSE-2.0
1161+ *
1162+ * Unless required by applicable law or agreed to in writing, software
1163+ * distributed under the License is distributed on an "AS IS" BASIS,
1164+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1165+ * See the License for the specific language governing permissions and
1166+ * limitations under the License.
1167+ */
1168+
1169+#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX_H_
1170+#define MINDSPORE_NNACL_ARITHMETIC_AVX_H_
1171+
1172+#include "nnacl/intrinsics/ms_simd_instructions.h"
1173+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1174+
1175+#ifdef __cplusplus
1176+extern "C" {
1177+#endif
1178+#pragma GCC push_options
1179+#pragma GCC target("avx", "avx2")
1180+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1181+#define BLOCK_NUM 8
1182+#define MS_SIMD_AVX
1183+
1184+#ifndef MS_SIMD_NEON
1185+static inline int ElementFloorModAVX(int index, const float *in0, const float *in1, float *out, int size) {
1186+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1187+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1188+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1189+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1190+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
1191+    SIMD_ST_F32(out + index, out_tmp);
1192+  }
1193+  return index;
1194+}
1195+
1196+static inline int ElementOptFloorModNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1197+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1198+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1199+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1200+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1201+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
1202+    SIMD_ST_F32(out + index, out_tmp);
1203+  }
1204+  return index;
1205+}
1206+
1207+static inline int ElementOptFloorModNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1208+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1209+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1210+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1211+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1212+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
1213+    SIMD_ST_F32(out + index, out_tmp);
1214+  }
1215+  return index;
1216+}
1217+
1218+static inline int ElementFloorDivAVX(int index, const float *in0, const float *in1, float *out, int size) {
1219+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1220+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1221+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1222+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1223+    SIMD_ST_F32(out + index, floor_tmp);
1224+  }
1225+  return index;
1226+}
1227+
1228+static inline int ElementOptFloorDivNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1229+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1230+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1231+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1232+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1233+    SIMD_ST_F32(out + index, out_tmp);
1234+  }
1235+  return index;
1236+}
1237+
1238+static inline int ElementOptFloorDivNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1239+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1240+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1241+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1242+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1243+    SIMD_ST_F32(out + index, out_tmp);
1244+  }
1245+  return index;
1246+}
1247+#endif
1248+
1249+static inline int ElementFloorDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1250+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1251+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1252+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1253+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
1254+    SIMD_ST_EPI32(out + index, out_tmp);
1255+  }
1256+  return index;
1257+}
1258+
1259+static inline int ElementOptFloorDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
1260+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
1261+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1262+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1263+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
1264+    SIMD_ST_EPI32(out + index, out_tmp);
1265+  }
1266+  return index;
1267+}
1268+
1269+static inline int ElementOptFloorDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
1270+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
1271+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1272+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1273+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
1274+    SIMD_ST_EPI32(out + index, out_tmp);
1275+  }
1276+  return index;
1277+}
1278+
1279+static inline int ElementMaximumAVX(int index, const float *in0, const float *in1, float *out, int size) {
1280+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1281+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1282+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1283+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
1284+    SIMD_ST_F32(out + index, out_tmp);
1285+  }
1286+  return index;
1287+}
1288+
1289+static inline int ElementOptMaximumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1290+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1291+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1292+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1293+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
1294+    SIMD_ST_F32(out + index, out_tmp);
1295+  }
1296+  return index;
1297+}
1298+
1299+static inline int ElementOptMaximumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1300+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1301+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1302+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1303+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
1304+    SIMD_ST_F32(out + index, out_tmp);
1305+  }
1306+  return index;
1307+}
1308+
1309+static inline int ElementMaximumIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1310+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1311+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1312+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1313+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
1314+    SIMD_ST_EPI32(out + index, out_tmp);
1315+  }
1316+  return index;
1317+}
1318+
1319+static inline int ElementOptMaximumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
1320+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
1321+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1322+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1323+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
1324+    SIMD_ST_EPI32(out + index, out_tmp);
1325+  }
1326+  return index;
1327+}
1328+
1329+static inline int ElementOptMaximumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
1330+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
1331+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1332+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1333+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
1334+    SIMD_ST_EPI32(out + index, out_tmp);
1335+  }
1336+  return index;
1337+}
1338+
1339+static inline int ElementMinimumIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1340+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1341+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1342+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1343+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
1344+    SIMD_ST_EPI32(out + index, out_tmp);
1345+  }
1346+  return index;
1347+}
1348+
1349+static inline int ElementOptMinimumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
1350+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
1351+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1352+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1353+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
1354+    SIMD_ST_EPI32(out + index, out_tmp);
1355+  }
1356+  return index;
1357+}
1358+
1359+static inline int ElementOptMinimumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
1360+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
1361+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1362+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1363+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
1364+    SIMD_ST_EPI32(out + index, out_tmp);
1365+  }
1366+  return index;
1367+}
1368+
1369+static inline int ElementMinimumAVX(int index, const float *in0, const float *in1, float *out, int size) {
1370+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1371+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1372+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1373+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
1374+    SIMD_ST_F32(out + index, out_tmp);
1375+  }
1376+  return index;
1377+}
1378+
1379+static inline int ElementOptMinimumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1380+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1381+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1382+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1383+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
1384+    SIMD_ST_F32(out + index, out_tmp);
1385+  }
1386+  return index;
1387+}
1388+
1389+static inline int ElementOptMinimumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1390+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1391+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1392+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1393+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
1394+    SIMD_ST_F32(out + index, out_tmp);
1395+  }
1396+  return index;
1397+}
1398+
1399+#undef MS_SIMD_INSTRUCTION
1400+#undef BLOCK_NUM
1401+#pragma GCC pop_options
1402+#undef MS_SIMD_AVX
1403+#ifdef __cplusplus
1404+}
1405+#endif
1406+#endif
1407diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
1408new file mode 100644
1409index 00000000..c48500f4
1410--- /dev/null
1411+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
1412@@ -0,0 +1,129 @@
1413+/**
1414+ * Copyright 2022 Huawei Technologies Co., Ltd
1415+ *
1416+ * Licensed under the Apache License, Version 2.0 (the "License");
1417+ * you may not use this file except in compliance with the License.
1418+ * You may obtain a copy of the License at
1419+ *
1420+ * http://www.apache.org/licenses/LICENSE-2.0
1421+ *
1422+ * Unless required by applicable law or agreed to in writing, software
1423+ * distributed under the License is distributed on an "AS IS" BASIS,
1424+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1425+ * See the License for the specific language governing permissions and
1426+ * limitations under the License.
1427+ */
1428+
1429+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_
1430+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_
1431+
1432+#include "nnacl/intrinsics/ms_simd_instructions.h"
1433+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1434+
1435+#ifdef __cplusplus
1436+extern "C" {
1437+#endif
1438+#pragma GCC push_options
1439+#pragma GCC target("avx", "avx2")
1440+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1441+#define BLOCK_NUM 8
1442+#define MS_SIMD_AVX
1443+
1444+#if defined(MS_SIMD_AVX512)
1445+// only avx512 support abs fp32 instruction
1446+static inline int ElementAbsAVX(int index, const float *input, float *output, const int element_size) {
1447+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1448+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
1449+  }
1450+  return index;
1451+}
1452+
1453+static inline int ElementAbsIntAVX(int index, const int *input, int *output, const int element_size) {
1454+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1455+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
1456+  }
1457+  return index;
1458+}
1459+#endif
1460+
1461+static inline int ElementSquareAVX(int index, const float *input, float *output, const int element_size) {
1462+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1463+    SIMD_F32 vin = SIMD_LD_F32(input + index);
1464+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
1465+  }
1466+  return index;
1467+}
1468+
1469+static inline int ElementSqrtAVX(int index, const float *input, float *output, const int element_size) {
1470+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1471+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
1472+  }
1473+  return index;
1474+}
1475+
1476+static inline int ElementRsqrtAVX(int index, const float *input, float *output, const int element_size) {
1477+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1478+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
1479+  }
1480+  return index;
1481+}
1482+
1483+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
1484+// avx512 dont support round fp32 instruction
1485+static inline int ElementRoundAVX(int index, const float *input, float *output, const int element_size) {
1486+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1487+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
1488+  }
1489+  return index;
1490+}
1491+#endif
1492+
1493+#ifndef MS_SIMD_NEON
1494+// neon dont support floor fp32 instruction
1495+static inline int ElementFloorAVX(int index, const float *input, float *output, const int element_size) {
1496+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1497+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
1498+  }
1499+  return index;
1500+}
1501+#endif
1502+
1503+#ifndef MS_SIMD_NEON
1504+static inline int ElementCeilAVX(int index, const float *input, float *output, const int element_size) {
1505+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1506+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
1507+  }
1508+  return index;
1509+}
1510+#endif
1511+
1512+static inline int ElementNegativeAVX(int index, const float *input, float *output, const int element_size) {
1513+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1514+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
1515+  }
1516+  return index;
1517+}
1518+
1519+static inline int ElementNegativeIntAVX(int index, const int *input, int *output, const int element_size) {
1520+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1521+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
1522+  }
1523+  return index;
1524+}
1525+
1526+static inline int ElementReciprocalAVX(int index, const float *input, float *output, const int element_size) {
1527+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
1528+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1529+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
1530+  }
1531+  return index;
1532+}
1533+
1534+#undef MS_SIMD_INSTRUCTION
1535+#undef BLOCK_NUM
1536+#pragma GCC pop_options
1537+#undef MS_SIMD_AVX
1538+#ifdef __cplusplus
1539+}
1540+#endif
1541+#endif
1542diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
1543new file mode 100644
1544index 00000000..11a9087b
1545--- /dev/null
1546+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
1547@@ -0,0 +1,67 @@
1548+/**
1549+ * Copyright 2022 Huawei Technologies Co., Ltd
1550+ *
1551+ * Licensed under the Apache License, Version 2.0 (the "License");
1552+ * you may not use this file except in compliance with the License.
1553+ * You may obtain a copy of the License at
1554+ *
1555+ * http://www.apache.org/licenses/LICENSE-2.0
1556+ *
1557+ * Unless required by applicable law or agreed to in writing, software
1558+ * distributed under the License is distributed on an "AS IS" BASIS,
1559+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1560+ * See the License for the specific language governing permissions and
1561+ * limitations under the License.
1562+ */
1563+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_
1564+#define MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_
1565+
1566+#include "nnacl/intrinsics/ms_simd_instructions.h"
1567+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1568+
1569+#ifdef __cplusplus
1570+extern "C" {
1571+#endif
1572+#pragma GCC push_options
1573+#pragma GCC target("avx", "avx2")
1574+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1575+#define BLOCK_NUM 8
1576+#define MS_SIMD_AVX
1577+
1578+static inline int BatchNormFp32AVX(int index, const float *input, const float *mean,
1579+  const float *variance, int channel, float epsilon, float *output) {
1580+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1581+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
1582+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
1583+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
1584+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
1585+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
1586+    SIMD_ST_F32(output + index, output_data);
1587+  }
1588+  return index;
1589+}
1590+
1591+static inline int FusedBatchNormFp32AVX(int index, const float *input, const float *scale,
1592+  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
1593+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1594+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
1595+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
1596+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
1597+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
1598+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
1599+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
1600+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
1601+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
1602+    SIMD_ST_F32(output + index, output_data);
1603+  }
1604+  return index;
1605+}
1606+
1607+#undef MS_SIMD_INSTRUCTION
1608+#undef BLOCK_NUM
1609+#pragma GCC pop_options
1610+#undef MS_SIMD_AVX
1611+#ifdef __cplusplus
1612+}
1613+#endif
1614+#endif
1615diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
1616new file mode 100644
1617index 00000000..9da68a79
1618--- /dev/null
1619+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
1620@@ -0,0 +1,69 @@
1621+/**
1622+ * Copyright 2022 Huawei Technologies Co., Ltd
1623+ *
1624+ * Licensed under the Apache License, Version 2.0 (the "License");
1625+ * you may not use this file except in compliance with the License.
1626+ * You may obtain a copy of the License at
1627+ *
1628+ * http://www.apache.org/licenses/LICENSE-2.0
1629+ *
1630+ * Unless required by applicable law or agreed to in writing, software
1631+ * distributed under the License is distributed on an "AS IS" BASIS,
1632+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1633+ * See the License for the specific language governing permissions and
1634+ * limitations under the License.
1635+ */
1636+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_
1637+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_
1638+
1639+#include "nnacl/intrinsics/ms_simd_instructions.h"
1640+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1641+
1642+#ifdef __cplusplus
1643+extern "C" {
1644+#endif
1645+#pragma GCC push_options
1646+#pragma GCC target("avx", "avx2")
1647+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1648+#define BLOCK_NUM 8
1649+#define MS_SIMD_AVX
1650+
1651+static inline int BCEWithLogitLossAVX(int index, const float *logits, const float *label,
1652+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
1653+    float *reduction_sum) {
1654+    SIMD_F32 zero = SIMD_SET0_F32;
1655+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
1656+    SIMD_F32 middle_output = SIMD_SET0_F32;
1657+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1658+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
1659+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
1660+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
1661+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
1662+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
1663+      SIMD_F32 max_value = neg_logits_tmp;
1664+      max_value = SIMD_MIN_F32(max_value, zero);
1665+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
1666+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
1667+      SIMD_F32 log_exp_value =
1668+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
1669+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
1670+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
1671+      if (reduction) {
1672+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
1673+      } else {
1674+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
1675+      }
1676+    }
1677+    if (reduction) {
1678+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
1679+    }
1680+    return index;
1681+}
1682+#undef MS_SIMD_INSTRUCTION
1683+#undef BLOCK_NUM
1684+#pragma GCC pop_options
1685+#undef MS_SIMD_AVX
1686+#ifdef __cplusplus
1687+}
1688+#endif
1689+#endif
1690diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
1691new file mode 100644
1692index 00000000..e54588bb
1693--- /dev/null
1694+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
1695@@ -0,0 +1,64 @@
1696+/**
1697+ * Copyright 2022 Huawei Technologies Co., Ltd
1698+ *
1699+ * Licensed under the Apache License, Version 2.0 (the "License");
1700+ * you may not use this file except in compliance with the License.
1701+ * You may obtain a copy of the License at
1702+ *
1703+ * http://www.apache.org/licenses/LICENSE-2.0
1704+ *
1705+ * Unless required by applicable law or agreed to in writing, software
1706+ * distributed under the License is distributed on an "AS IS" BASIS,
1707+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1708+ * See the License for the specific language governing permissions and
1709+ * limitations under the License.
1710+ */
1711+
1712+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
1713+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
1714+
1715+#include "nnacl/intrinsics/ms_simd_instructions.h"
1716+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1717+
1718+#ifdef __cplusplus
1719+extern "C" {
1720+#endif
1721+#pragma GCC push_options
1722+#pragma GCC target("avx", "avx2")
1723+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1724+#define BLOCK_NUM 8
1725+#define MS_SIMD_AVX
1726+
1727+static inline int BiasAddByInnerCoreAVX(int index, const float *input, const float *bias, float *output,
1728+                                                       int64_t num) {
1729+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1730+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
1731+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
1732+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
1733+    SIMD_ST_F32(output + index, vout);
1734+  }
1735+  return index;
1736+}
1737+
1738+static inline int BiasAddByBatchCoreAVX(int index, const float *input, const float *bias, float *output1,
1739+                                                       float *output2, float *output3, float *output4, int64_t num) {
1740+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1741+    SIMD_LDX4_F32(input_data, input + index, num);
1742+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
1743+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
1744+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
1745+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
1746+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
1747+  }
1748+  return index;
1749+}
1750+
1751+#undef MS_SIMD_INSTRUCTION
1752+#undef BLOCK_NUM
1753+#pragma GCC pop_options
1754+#undef MS_SIMD_AVX
1755+#ifdef __cplusplus
1756+}
1757+#endif
1758+
1759+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
1760diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
1761new file mode 100644
1762index 00000000..44176549
1763--- /dev/null
1764+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
1765@@ -0,0 +1,56 @@
1766+/**
1767+ * Copyright 2022 Huawei Technologies Co., Ltd
1768+ *
1769+ * Licensed under the Apache License, Version 2.0 (the "License");
1770+ * you may not use this file except in compliance with the License.
1771+ * You may obtain a copy of the License at
1772+ *
1773+ * http://www.apache.org/licenses/LICENSE-2.0
1774+ *
1775+ * Unless required by applicable law or agreed to in writing, software
1776+ * distributed under the License is distributed on an "AS IS" BASIS,
1777+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1778+ * See the License for the specific language governing permissions and
1779+ * limitations under the License.
1780+ */
1781+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_
1782+#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_
1783+
1784+#include "nnacl/intrinsics/ms_simd_instructions.h"
1785+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1786+
1787+#ifdef __cplusplus
1788+extern "C" {
1789+#endif
1790+#pragma GCC push_options
1791+#pragma GCC target("avx", "avx2")
1792+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1793+#define BLOCK_NUM 8
1794+#define MS_SIMD_AVX
1795+
1796+static inline int Int32ToFloat32AVX(int index, const int32_t *input, float *output, int number) {
1797+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1798+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
1799+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
1800+  }
1801+  return index;
1802+}
1803+
1804+#ifndef MS_SIMD_NEON
1805+static inline int Float32ToInt32AVX(int index, const float *input, int32_t *output, int number) {
1806+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1807+    SIMD_F32 value = SIMD_LD_F32(input + index);
1808+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
1809+  }
1810+  return index;
1811+}
1812+#endif
1813+
1814+#undef MS_SIMD_INSTRUCTION
1815+#undef BLOCK_NUM
1816+#pragma GCC pop_options
1817+#undef MS_SIMD_AVX
1818+#ifdef __cplusplus
1819+}
1820+#endif
1821+#endif
1822diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
1823new file mode 100644
1824index 00000000..dac9efa9
1825--- /dev/null
1826+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
1827@@ -0,0 +1,70 @@
1828+/**
1829+ * Copyright 2022 Huawei Technologies Co., Ltd
1830+ *
1831+ * Licensed under the Apache License, Version 2.0 (the "License");
1832+ * you may not use this file except in compliance with the License.
1833+ * You may obtain a copy of the License at
1834+ *
1835+ * http://www.apache.org/licenses/LICENSE-2.0
1836+ *
1837+ * Unless required by applicable law or agreed to in writing, software
1838+ * distributed under the License is distributed on an "AS IS" BASIS,
1839+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1840+ * See the License for the specific language governing permissions and
1841+ * limitations under the License.
1842+ */
1843+#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX_H_
1844+#define MINDSPORE_NNACL_FP32_CDIST_AVX_H_
1845+
1846+#include "nnacl/intrinsics/ms_simd_instructions.h"
1847+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1848+
1849+#ifdef __cplusplus
1850+extern "C" {
1851+#endif
1852+#pragma GCC push_options
1853+#pragma GCC target("avx", "avx2")
1854+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1855+#define BLOCK_NUM 8
1856+#define MS_SIMD_AVX
1857+
1858+static inline int64_t CdistTwoNormalOptAVX(int64_t index, const float *a, const float *b,
1859+                                                          float *out, int64_t size) {
1860+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
1861+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1862+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
1863+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
1864+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
1865+    tmp_vec = SIMD_ABS_F32(tmp_vec);
1866+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
1867+  }
1868+  *out += SIMD_GET_SUM_F32(result_vec);
1869+
1870+  return index;
1871+}
1872+
1873+static inline int64_t CdistPNormalOptAVX(int64_t index, const float *a, const float *b,
1874+                                                        float *out, int64_t size, float p) {
1875+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
1876+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
1877+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1878+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
1879+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
1880+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
1881+    tmp_vec = SIMD_ABS_F32(tmp_vec);
1882+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
1883+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
1884+  }
1885+  *out += SIMD_GET_SUM_F32(result_vec);
1886+
1887+  return index;
1888+}
1889+
1890+#undef MS_SIMD_INSTRUCTION
1891+#undef BLOCK_NUM
1892+#pragma GCC pop_options
1893+#undef MS_SIMD_AVX
1894+#ifdef __cplusplus
1895+}
1896+#endif
1897+#endif
1898diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
1899new file mode 100644
1900index 00000000..7407942f
1901--- /dev/null
1902+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
1903@@ -0,0 +1,121 @@
1904+/**
1905+ * Copyright 2022 Huawei Technologies Co., Ltd
1906+ *
1907+ * Licensed under the Apache License, Version 2.0 (the "License");
1908+ * you may not use this file except in compliance with the License.
1909+ * You may obtain a copy of the License at
1910+ *
1911+ * http://www.apache.org/licenses/LICENSE-2.0
1912+ *
1913+ * Unless required by applicable law or agreed to in writing, software
1914+ * distributed under the License is distributed on an "AS IS" BASIS,
1915+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1916+ * See the License for the specific language governing permissions and
1917+ * limitations under the License.
1918+ */
1919+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_
1920+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_
1921+
1922+#include "nnacl/intrinsics/ms_simd_instructions.h"
1923+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1924+
1925+#ifdef __cplusplus
1926+extern "C" {
1927+#endif
1928+#pragma GCC push_options
1929+#pragma GCC target("avx", "avx2")
1930+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1931+#define BLOCK_NUM 8
1932+#define MS_SIMD_AVX
1933+
1934+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
1935+// (a, b, c) -> (0, a,   a+b)    exclusive == true
1936+static inline int64_t CumsumOutputInitWithInputAVX(int64_t index, const float *layer_input,
1937+  float *layer_output, int inner_dim) {
1938+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1939+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
1940+  }
1941+  return index;
1942+}
1943+
1944+static inline int64_t CumsumOutputInitWithZeroAVX(int64_t index, float *layer_output, int inner_dim) {
1945+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1946+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
1947+  }
1948+  return index;
1949+}
1950+
1951+static inline int64_t CumsumAVX(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
1952+  int inner_dim) {
1953+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1954+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
1955+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
1956+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
1957+    SIMD_ST_F32(layer_output + index, out_val);
1958+  }
1959+  return index;
1960+}
1961+
1962+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
1963+// (a, b, c) -> (c+b, c, 0) exclusive==true
1964+static inline int64_t CumsumReverseAVX(int64_t index, const float *layer_input, float *layer_output,
1965+  float *layer_last_output, int inner_dim) {
1966+
1967+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1968+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
1969+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
1970+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
1971+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
1972+  }
1973+  return index;
1974+}
1975+
1976+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
1977+// (a, b, c) -> (0, a,   a+b)    exclusive == true
1978+static inline int64_t CumsumIntOutputInitWithInputAVX(int64_t index, const int *layer_input,
1979+  int *layer_output, int inner_dim) {
1980+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1981+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
1982+  }
1983+  return index;
1984+}
1985+
1986+static inline int64_t CumsumIntOutputInitWithZeroAVX(int64_t index, int *layer_output, int inner_dim) {
1987+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1988+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
1989+  }
1990+  return index;
1991+}
1992+
1993+static inline int64_t CumsumIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
1994+  int inner_dim) {
1995+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1996+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
1997+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
1998+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
1999+    SIMD_ST_EPI32(layer_output + index, out_val);
2000+  }
2001+  return index;
2002+}
2003+
2004+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
2005+// (a, b, c) -> (c+b, c, 0) exclusive==true
2006+static inline int64_t CumsumReverseIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
2007+  int inner_dim) {
2008+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2009+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
2010+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
2011+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
2012+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
2013+  }
2014+  return index;
2015+}
2016+
2017+#undef MS_SIMD_INSTRUCTION
2018+#undef BLOCK_NUM
2019+#pragma GCC pop_options
2020+#undef MS_SIMD_AVX
2021+#ifdef __cplusplus
2022+}
2023+#endif
2024+#endif
2025diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
2026new file mode 100644
2027index 00000000..3710151e
2028--- /dev/null
2029+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
2030@@ -0,0 +1,167 @@
2031+/**
2032+ * Copyright 2022 Huawei Technologies Co., Ltd
2033+ *
2034+ * Licensed under the Apache License, Version 2.0 (the "License");
2035+ * you may not use this file except in compliance with the License.
2036+ * You may obtain a copy of the License at
2037+ *
2038+ * http://www.apache.org/licenses/LICENSE-2.0
2039+ *
2040+ * Unless required by applicable law or agreed to in writing, software
2041+ * distributed under the License is distributed on an "AS IS" BASIS,
2042+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2043+ * See the License for the specific language governing permissions and
2044+ * limitations under the License.
2045+ */
2046+
2047+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX_H_
2048+#define MINDSPORE_NNACL_FP32_DIV_AVX_H_
2049+
2050+#include "nnacl/intrinsics/ms_simd_instructions.h"
2051+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2052+
2053+#ifdef __cplusplus
2054+extern "C" {
2055+#endif
2056+#pragma GCC push_options
2057+#pragma GCC target("avx", "avx2")
2058+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2059+#define BLOCK_NUM 8
2060+#define MS_SIMD_AVX
2061+
2062+static inline int ElementOptDivNum0AVX(int index, const float *in0, const float *in1, float *out,
2063+                                                      int size) {
2064+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
2065+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2066+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2067+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
2068+    SIMD_ST_F32(out + index, vout);
2069+  }
2070+  return index;
2071+}
2072+
2073+static inline int ElementOptDivNum1AVX(int index, const float *in0, const float *in1, float *out,
2074+                                                      int size) {
2075+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2076+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2077+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2078+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
2079+    SIMD_ST_F32(out + index, vout);
2080+  }
2081+  return index;
2082+}
2083+
2084+static inline int ElementOptDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2085+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
2086+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2087+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2088+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
2089+    SIMD_ST_EPI32(out + index, vout);
2090+  }
2091+  return index;
2092+}
2093+
2094+static inline int ElementOptDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2095+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2096+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2097+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2098+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
2099+    SIMD_ST_EPI32(out + index, vout);
2100+  }
2101+  return index;
2102+}
2103+
2104+static inline int ElementOptDivReluNum0AVX(int index, const float *in0, const float *in1, float *out,
2105+                                                          int size) {
2106+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
2107+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2108+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2109+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
2110+    SIMD_ST_F32(out + index, vout);
2111+  }
2112+  return index;
2113+}
2114+
2115+static inline int ElementOptDivReluNum1AVX(int index, const float *in0, const float *in1, float *out,
2116+                                                          int size) {
2117+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2118+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2119+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2120+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
2121+    SIMD_ST_F32(out + index, vout);
2122+  }
2123+  return index;
2124+}
2125+
2126+static inline int ElementOptDivRelu6Num0AVX(int index, const float *in0, const float *in1, float *out,
2127+                                                           int size) {
2128+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
2129+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2130+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2131+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
2132+    SIMD_ST_F32(out + index, vout);
2133+  }
2134+  return index;
2135+}
2136+
2137+static inline int ElementOptDivRelu6Num1AVX(int index, const float *in0, const float *in1, float *out,
2138+                                                           int size) {
2139+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2140+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2141+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2142+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
2143+    SIMD_ST_F32(out + index, vout);
2144+  }
2145+  return index;
2146+}
2147+
2148+static inline int ElementDivAVX(int index, const float *in0, const float *in1, float *out, int size) {
2149+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2150+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2151+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2152+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
2153+    SIMD_ST_F32(out + index, vout);
2154+  }
2155+  return index;
2156+}
2157+
2158+static inline int ElementDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2159+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2160+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2161+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2162+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
2163+    SIMD_ST_EPI32(out + index, vout);
2164+  }
2165+  return index;
2166+}
2167+
2168+static inline int ElementDivReluAVX(int index, const float *in0, const float *in1, float *out,
2169+                                                   int size) {
2170+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2171+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2172+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2173+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
2174+    SIMD_ST_F32(out + index, vout);
2175+  }
2176+  return index;
2177+}
2178+
2179+static inline int ElementDivRelu6AVX(int index, const float *in0, const float *in1, float *out,
2180+                                                    int size) {
2181+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2182+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2183+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2184+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
2185+    SIMD_ST_F32(out + index, vout);
2186+  }
2187+  return index;
2188+}
2189+
2190+#undef MS_SIMD_INSTRUCTION
2191+#undef BLOCK_NUM
2192+#pragma GCC pop_options
2193+#undef MS_SIMD_AVX
2194+#ifdef __cplusplus
2195+}
2196+#endif
2197+#endif
2198diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
2199new file mode 100644
2200index 00000000..cbd4eca5
2201--- /dev/null
2202+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
2203@@ -0,0 +1,46 @@
2204+/**
2205+ * Copyright 2022 Huawei Technologies Co., Ltd
2206+ *
2207+ * Licensed under the Apache License, Version 2.0 (the "License");
2208+ * you may not use this file except in compliance with the License.
2209+ * You may obtain a copy of the License at
2210+ *
2211+ * http://www.apache.org/licenses/LICENSE-2.0
2212+ *
2213+ * Unless required by applicable law or agreed to in writing, software
2214+ * distributed under the License is distributed on an "AS IS" BASIS,
2215+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2216+ * See the License for the specific language governing permissions and
2217+ * limitations under the License.
2218+ */
2219+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_
2220+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_
2221+
2222+#include "nnacl/intrinsics/ms_simd_instructions.h"
2223+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2224+
2225+#ifdef __cplusplus
2226+extern "C" {
2227+#endif
2228+#pragma GCC push_options
2229+#pragma GCC target("avx", "avx2")
2230+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2231+#define BLOCK_NUM 8
2232+#define MS_SIMD_AVX
2233+
2234+static inline int DropoutFp32AVX(int index, const float *input, float scale,
2235+  int length, float *output) {
2236+  SIMD_F32 scale_value = SIMD_MOV_F32(scale);
2237+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2238+    SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
2239+  }
2240+  return index;
2241+}
2242+#undef MS_SIMD_INSTRUCTION
2243+#undef BLOCK_NUM
2244+#pragma GCC pop_options
2245+#undef MS_SIMD_AVX
2246+#ifdef __cplusplus
2247+}
2248+#endif
2249+#endif
2250diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
2251new file mode 100644
2252index 00000000..cf7cbd37
2253--- /dev/null
2254+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
2255@@ -0,0 +1,63 @@
2256+/**
2257+ * Copyright 2022 Huawei Technologies Co., Ltd
2258+ *
2259+ * Licensed under the Apache License, Version 2.0 (the "License");
2260+ * you may not use this file except in compliance with the License.
2261+ * You may obtain a copy of the License at
2262+ *
2263+ * http://www.apache.org/licenses/LICENSE-2.0
2264+ *
2265+ * Unless required by applicable law or agreed to in writing, software
2266+ * distributed under the License is distributed on an "AS IS" BASIS,
2267+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2268+ * See the License for the specific language governing permissions and
2269+ * limitations under the License.
2270+ */
2271+
2272+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX_H_
2273+#define MINDSPORE_NNACL_FP32_EXP_AVX_H_
2274+
2275+#include "nnacl/intrinsics/ms_simd_instructions.h"
2276+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2277+
2278+#ifdef __cplusplus
2279+extern "C" {
2280+#endif
2281+#pragma GCC push_options
2282+#pragma GCC target("avx", "avx2")
2283+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2284+#define BLOCK_NUM 8
2285+#define MS_SIMD_AVX
2286+
2287+static inline int64_t ExpFp32AVX(int64_t index, const float *src, float *dst, int num) {
2288+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2289+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
2290+  }
2291+  return index;
2292+}
2293+
2294+static inline int64_t ExpFp32WithInScaleAVX(int64_t index, const float *src, float *dst, int num, float in_scale) {
2295+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
2296+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2297+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
2298+  }
2299+  return index;
2300+}
2301+
2302+static inline int64_t ExpFp32WithOutScaleAVX(int64_t index, const float *src, float *dst, int num, float out_scale) {
2303+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
2304+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2305+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
2306+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
2307+  }
2308+  return index;
2309+}
2310+
2311+#undef MS_SIMD_INSTRUCTION
2312+#undef BLOCK_NUM
2313+#pragma GCC pop_options
2314+#undef MS_SIMD_AVX
2315+#ifdef __cplusplus
2316+}
2317+#endif
2318+#endif
2319diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
2320new file mode 100644
2321index 00000000..8b01844e
2322--- /dev/null
2323+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
2324@@ -0,0 +1,53 @@
2325+/**
2326+ * Copyright 2022 Huawei Technologies Co., Ltd
2327+ *
2328+ * Licensed under the Apache License, Version 2.0 (the "License");
2329+ * you may not use this file except in compliance with the License.
2330+ * You may obtain a copy of the License at
2331+ *
2332+ * http://www.apache.org/licenses/LICENSE-2.0
2333+ *
2334+ * Unless required by applicable law or agreed to in writing, software
2335+ * distributed under the License is distributed on an "AS IS" BASIS,
2336+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2337+ * See the License for the specific language governing permissions and
2338+ * limitations under the License.
2339+ */
2340+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_
2341+#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_
2342+
2343+#include "nnacl/intrinsics/ms_simd_instructions.h"
2344+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2345+
2346+#ifdef __cplusplus
2347+extern "C" {
2348+#endif
2349+#pragma GCC push_options
2350+#pragma GCC target("avx", "avx2")
2351+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2352+#define BLOCK_NUM 8
2353+#define MS_SIMD_AVX
2354+
2355+static inline int FillFp32AVX(int index, float *output, int size, float data) {
2356+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2357+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
2358+  }
2359+  return index;
2360+}
2361+
2362+static inline int FillInt32AVX(int index, int *output, int size, int data) {
2363+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2364+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
2365+  }
2366+  return index;
2367+}
2368+
2369+#undef MS_SIMD_INSTRUCTION
2370+#undef BLOCK_NUM
2371+#pragma GCC pop_options
2372+#undef MS_SIMD_AVX
2373+#ifdef __cplusplus
2374+}
2375+#endif
2376+#endif
2377+
2378diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
2379new file mode 100644
2380index 00000000..d5076e59
2381--- /dev/null
2382+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
2383@@ -0,0 +1,77 @@
2384+/**
2385+ * Copyright 2022 Huawei Technologies Co., Ltd
2386+ *
2387+ * Licensed under the Apache License, Version 2.0 (the "License");
2388+ * you may not use this file except in compliance with the License.
2389+ * You may obtain a copy of the License at
2390+ *
2391+ * http://www.apache.org/licenses/LICENSE-2.0
2392+ *
2393+ * Unless required by applicable law or agreed to in writing, software
2394+ * distributed under the License is distributed on an "AS IS" BASIS,
2395+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2396+ * See the License for the specific language governing permissions and
2397+ * limitations under the License.
2398+ */
2399+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_
2400+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_
2401+
2402+#include "nnacl/intrinsics/ms_simd_instructions.h"
2403+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2404+
2405+#ifdef __cplusplus
2406+extern "C" {
2407+#endif
2408+#pragma GCC push_options
2409+#pragma GCC target("avx", "avx2")
2410+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2411+#define BLOCK_NUM 8
2412+#define MS_SIMD_AVX
2413+
2414+static inline int64_t GroupNormFp32AVX(int64_t index, const float *unit_input, float scale, float offset, float mean,
2415+  float var_sqrt, int unit, float *unit_output) {
2416+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
2417+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
2418+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
2419+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
2420+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2421+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
2422+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
2423+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
2424+    SIMD_ST_F32(unit_output + index, output);
2425+  }
2426+  return index;
2427+}
2428+
2429+static inline int64_t GroupNormReduceSumAVX(int64_t index, const float *in, float *sum, int unit) {
2430+  if (unit - index >= 4 * BLOCK_NUM) {
2431+    SIMD_F32 tmp = SIMD_MOV_F32(0);
2432+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2433+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
2434+    }
2435+    *sum += SIMD_GET_SUM_F32(tmp);
2436+  }
2437+  return index;
2438+}
2439+
2440+static inline int64_t GroupNormReduceVarAVX(int64_t index, const float *in, float mean, float *sum, int unit) {
2441+  if (unit - index >= 4 * BLOCK_NUM) {
2442+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
2443+    SIMD_F32 tmp = SIMD_MOV_F32(0);
2444+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2445+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
2446+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
2447+    }
2448+    *sum += SIMD_GET_SUM_F32(tmp);
2449+  }
2450+  return index;
2451+}
2452+
2453+#undef MS_SIMD_INSTRUCTION
2454+#undef BLOCK_NUM
2455+#pragma GCC pop_options
2456+#undef MS_SIMD_AVX
2457+#ifdef __cplusplus
2458+}
2459+#endif
2460+#endif
2461diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
2462new file mode 100644
2463index 00000000..96fdf185
2464--- /dev/null
2465+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
2466@@ -0,0 +1,68 @@
2467+/**
2468+ * Copyright 2022 Huawei Technologies Co., Ltd
2469+ *
2470+ * Licensed under the Apache License, Version 2.0 (the "License");
2471+ * you may not use this file except in compliance with the License.
2472+ * You may obtain a copy of the License at
2473+ *
2474+ * http://www.apache.org/licenses/LICENSE-2.0
2475+ *
2476+ * Unless required by applicable law or agreed to in writing, software
2477+ * distributed under the License is distributed on an "AS IS" BASIS,
2478+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2479+ * See the License for the specific language governing permissions and
2480+ * limitations under the License.
2481+ */
2482+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_
2483+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_
2484+
2485+#include "nnacl/intrinsics/ms_simd_instructions.h"
2486+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2487+
2488+#ifdef __cplusplus
2489+extern "C" {
2490+#endif
2491+#pragma GCC push_options
2492+#pragma GCC target("avx", "avx2")
2493+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2494+#define BLOCK_NUM 8
2495+#define MS_SIMD_AVX
2496+
2497+static inline int LayerNormMeanAndSquareAVX(int index, const float *src, int num, float *mean, float *square_mean) {
2498+  if (num >= 4 * BLOCK_NUM) {
2499+    SIMD_F32 sum_val = SIMD_SET0_F32;
2500+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
2501+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2502+      SIMD_F32 value = SIMD_LD_F32(src + index);
2503+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
2504+      sum_val = SIMD_ADD_F32(sum_val, value);
2505+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
2506+    }
2507+    *mean += SIMD_GET_SUM_F32(sum_val);
2508+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
2509+  }
2510+  return index;
2511+}
2512+
2513+static inline int LayerNormGammaAndBetaAVX(int index, float *dst, const float *src, const float *gamma_data,
2514+  const float *beta_data, int num, const float mean, const float deno) {
2515+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
2516+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
2517+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2518+    SIMD_F32 value = SIMD_LD_F32(src + index);
2519+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
2520+    out_value = SIMD_MUL_F32(out_value, deno_val);
2521+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
2522+    SIMD_ST_F32(dst + index, out_value);
2523+  }
2524+  return index;
2525+}
2526+
2527+#undef MS_SIMD_INSTRUCTION
2528+#undef BLOCK_NUM
2529+#pragma GCC pop_options
2530+#undef MS_SIMD_AVX
2531+#ifdef __cplusplus
2532+}
2533+#endif
2534+#endif
2535diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
2536new file mode 100644
2537index 00000000..523e120e
2538--- /dev/null
2539+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
2540@@ -0,0 +1,93 @@
2541+/**
2542+ * Copyright 2022 Huawei Technologies Co., Ltd
2543+ *
2544+ * Licensed under the Apache License, Version 2.0 (the "License");
2545+ * you may not use this file except in compliance with the License.
2546+ * You may obtain a copy of the License at
2547+ *
2548+ * http://www.apache.org/licenses/LICENSE-2.0
2549+ *
2550+ * Unless required by applicable law or agreed to in writing, software
2551+ * distributed under the License is distributed on an "AS IS" BASIS,
2552+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2553+ * See the License for the specific language governing permissions and
2554+ * limitations under the License.
2555+ */
2556+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_
2557+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_
2558+
2559+#include "nnacl/intrinsics/ms_simd_instructions.h"
2560+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2561+
2562+#ifdef __cplusplus
2563+extern "C" {
2564+#endif
2565+#pragma GCC push_options
2566+#pragma GCC target("avx", "avx2")
2567+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2568+#define BLOCK_NUM 8
2569+#define MS_SIMD_AVX
2570+
2571+// act_type must be 0, 1, 3. 0: no_act, 1: relu, 3: relu6.
2572+static inline int64_t GemmIsNotPackAVX(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
2573+  int deep, int act_type) {
2574+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
2575+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
2576+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
2577+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);
2578+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2579+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
2580+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
2581+    if (act_type != 0) {
2582+      dst = SIMD_MAX_F32(dst, down_threshold);
2583+      if (act_type == 3) {
2584+        dst = SIMD_MIN_F32(dst, up_threshold);
2585+      }
2586+    }
2587+    SIMD_ST_F32(c + index, dst);
2588+  }
2589+
2590+  return index;
2591+}
2592+
2593+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)
2594+static inline int64_t GemmIsNotPackOptimizeCoreAVX(int64_t index, const float *a, const float *b, int k, float *dst) {
2595+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
2596+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2597+    SIMD_F32 weight = SIMD_LD_F32(b + index);
2598+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
2599+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
2600+  }
2601+  *dst += SIMD_REDUCE_ADD_F32(dst1);
2602+  return index;
2603+}
2604+#endif
2605+
2606+static inline int64_t MatVecMulNoPackCoreAVX(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
2607+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
2608+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
2609+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
2610+    for (int64_t k = 0; k < depth; ++k) {
2611+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
2612+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
2613+      out = SIMD_FMADD_F32(left, right, out);
2614+    }
2615+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
2616+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
2617+      if (act_type == 0x3) {
2618+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
2619+      }
2620+    }
2621+    SIMD_ST_F32(c + oc_index, out);
2622+  }
2623+  return oc_index;
2624+}
2625+
2626+#undef MS_SIMD_INSTRUCTION
2627+#undef BLOCK_NUM
2628+#pragma GCC pop_options
2629+#undef MS_SIMD_AVX
2630+#ifdef __cplusplus
2631+}
2632+#endif
2633+#endif
2634diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
2635new file mode 100644
2636index 00000000..a5d8b0a0
2637--- /dev/null
2638+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
2639@@ -0,0 +1,218 @@
2640+/**
2641+ * Copyright 2022 Huawei Technologies Co., Ltd
2642+ *
2643+ * Licensed under the Apache License, Version 2.0 (the "License");
2644+ * you may not use this file except in compliance with the License.
2645+ * You may obtain a copy of the License at
2646+ *
2647+ * http://www.apache.org/licenses/LICENSE-2.0
2648+ *
2649+ * Unless required by applicable law or agreed to in writing, software
2650+ * distributed under the License is distributed on an "AS IS" BASIS,
2651+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2652+ * See the License for the specific language governing permissions and
2653+ * limitations under the License.
2654+ */
2655+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX_H_
2656+#define MINDSPORE_NNACL_FP32_MUL_AVX_H_
2657+
2658+#include "nnacl/intrinsics/ms_simd_instructions.h"
2659+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2660+
2661+#ifdef __cplusplus
2662+extern "C" {
2663+#endif
2664+#pragma GCC push_options
2665+#pragma GCC target("avx", "avx2")
2666+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2667+#define BLOCK_NUM 8
2668+#define MS_SIMD_AVX
2669+
2670+static inline int ElementMulAVX(int index, const float *in0, const float *in1, float *out, int size) {
2671+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2672+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2673+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2674+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
2675+    SIMD_ST_F32(out + index, vout);
2676+  }
2677+  return index;
2678+}
2679+
2680+static inline int ElementMulReluAVX(int index, const float *in0, const float *in1, float *out, int size) {
2681+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2682+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2683+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2684+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
2685+    SIMD_ST_F32(out + index, vout);
2686+  }
2687+  return index;
2688+}
2689+
2690+static inline int ElementMulRelu6AVX(int index, const float *in0, const float *in1, float *out, int size) {
2691+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2692+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2693+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2694+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
2695+    SIMD_ST_F32(out + index, vout);
2696+  }
2697+  return index;
2698+}
2699+
2700+static inline int ElementMulIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2701+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2702+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2703+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2704+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
2705+    SIMD_ST_EPI32(out + index, vout);
2706+  }
2707+  return index;
2708+}
2709+
2710+static inline int ElementMulReluIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2711+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2712+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2713+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2714+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
2715+    SIMD_ST_EPI32(out + index, vout);
2716+  }
2717+  return index;
2718+}
2719+
2720+static inline int ElementMulRelu6IntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2721+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2722+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2723+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2724+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
2725+    SIMD_ST_EPI32(out + index, vout);
2726+  }
2727+  return index;
2728+}
2729+
2730+static inline int ElementOptMulNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
2731+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
2732+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2733+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2734+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
2735+    SIMD_ST_F32(out + index, vout);
2736+  }
2737+  return index;
2738+}
2739+
2740+static inline int ElementOptMulNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
2741+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2742+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2743+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2744+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
2745+    SIMD_ST_F32(out + index, vout);
2746+  }
2747+  return index;
2748+}
2749+
2750+static inline int ElementOptMulReluNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
2751+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
2752+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2753+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2754+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
2755+    SIMD_ST_F32(out + index, vout);
2756+  }
2757+  return index;
2758+}
2759+
2760+static inline int ElementOptMulReluNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
2761+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2762+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2763+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2764+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
2765+    SIMD_ST_F32(out + index, vout);
2766+  }
2767+  return index;
2768+}
2769+
2770+static inline int ElementOptMulRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, int size) {
2771+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
2772+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2773+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2774+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
2775+    SIMD_ST_F32(out + index, vout);
2776+  }
2777+  return index;
2778+}
2779+
2780+static inline int ElementOptMulRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, int size) {
2781+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2782+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2783+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2784+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
2785+    SIMD_ST_F32(out + index, vout);
2786+  }
2787+  return index;
2788+}
2789+
2790+static inline int ElementOptMulIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2791+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
2792+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2793+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2794+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
2795+    SIMD_ST_EPI32(out + index, vout);
2796+  }
2797+  return index;
2798+}
2799+
2800+static inline int ElementOptMulIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2801+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2802+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2803+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2804+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
2805+    SIMD_ST_EPI32(out + index, vout);
2806+  }
2807+  return index;
2808+}
2809+
2810+static inline int ElementOptMulReluIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2811+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
2812+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2813+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2814+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
2815+    SIMD_ST_EPI32(out + index, vout);
2816+  }
2817+  return index;
2818+}
2819+
2820+static inline int ElementOptMulReluIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2821+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2822+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2823+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2824+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
2825+    SIMD_ST_EPI32(out + index, vout);
2826+  }
2827+  return index;
2828+}
2829+
2830+static inline int ElementOptMulRelu6IntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2831+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
2832+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2833+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2834+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
2835+    SIMD_ST_EPI32(out + index, vout);
2836+  }
2837+  return index;
2838+}
2839+
2840+static inline int ElementOptMulRelu6IntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2841+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2842+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2843+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2844+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
2845+    SIMD_ST_EPI32(out + index, vout);
2846+  }
2847+  return index;
2848+}
2849+
2850+#undef MS_SIMD_INSTRUCTION
2851+#undef BLOCK_NUM
2852+#pragma GCC pop_options
2853+#undef MS_SIMD_AVX
2854+#ifdef __cplusplus
2855+}
2856+#endif
2857+#endif
2858diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
2859new file mode 100644
2860index 00000000..d4bd2305
2861--- /dev/null
2862+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
2863@@ -0,0 +1,84 @@
2864+/**
2865+ * Copyright 2022 Huawei Technologies Co., Ltd
2866+ *
2867+ * Licensed under the Apache License, Version 2.0 (the "License");
2868+ * you may not use this file except in compliance with the License.
2869+ * You may obtain a copy of the License at
2870+ *
2871+ * http://www.apache.org/licenses/LICENSE-2.0
2872+ *
2873+ * Unless required by applicable law or agreed to in writing, software
2874+ * distributed under the License is distributed on an "AS IS" BASIS,
2875+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2876+ * See the License for the specific language governing permissions and
2877+ * limitations under the License.
2878+ */
2879+#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX_H_
2880+#define MINDSPORE_NNACL_FP32_POOLING_AVX_H_
2881+
2882+#include "nnacl/intrinsics/ms_simd_instructions.h"
2883+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2884+
2885+#ifdef __cplusplus
2886+extern "C" {
2887+#endif
2888+#pragma GCC push_options
2889+#pragma GCC target("avx", "avx2")
2890+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2891+#define BLOCK_NUM 8
2892+#define MS_SIMD_AVX
2893+
2894+static inline int AvgPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel,
2895+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
2896+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
2897+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
2898+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
2899+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
2900+    const float *src_c_ptr = src_plane_ptr + ci;
2901+    float *dst_c_ptr = dst_plane_ptr + ci;
2902+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
2903+    int real_count = 0;
2904+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
2905+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
2906+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
2907+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
2908+        ++real_count;
2909+      }
2910+    }
2911+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
2912+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
2913+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
2914+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
2915+  }
2916+  return ci;
2917+}
2918+
2919+static inline int MaxPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel,
2920+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
2921+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
2922+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
2923+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
2924+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
2925+    const float *src_c_ptr = src_plane_ptr + ci;
2926+    float *dst_c_ptr = dst_plane_ptr + ci;
2927+    SIMD_F32 tmp_max = min_val;
2928+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
2929+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
2930+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
2931+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
2932+      }
2933+    }
2934+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
2935+    SIMD_ST_F32(dst_c_ptr, tmp_max);
2936+  }
2937+  return ci;
2938+}
2939+
2940+#undef MS_SIMD_INSTRUCTION
2941+#undef BLOCK_NUM
2942+#pragma GCC pop_options
2943+#undef MS_SIMD_AVX
2944+#ifdef __cplusplus
2945+}
2946+#endif
2947+#endif
2948diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
2949new file mode 100644
2950index 00000000..2ada6cb3
2951--- /dev/null
2952+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
2953@@ -0,0 +1,101 @@
2954+/**
2955+ * Copyright 2022 Huawei Technologies Co., Ltd
2956+ *
2957+ * Licensed under the Apache License, Version 2.0 (the "License");
2958+ * you may not use this file except in compliance with the License.
2959+ * You may obtain a copy of the License at
2960+ *
2961+ * http://www.apache.org/licenses/LICENSE-2.0
2962+ *
2963+ * Unless required by applicable law or agreed to in writing, software
2964+ * distributed under the License is distributed on an "AS IS" BASIS,
2965+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2966+ * See the License for the specific language governing permissions and
2967+ * limitations under the License.
2968+ */
2969+#ifndef MINDSPORE_NNACL_FP32_POWER_AVX_H_
2970+#define MINDSPORE_NNACL_FP32_POWER_AVX_H_
2971+
2972+#include "nnacl/intrinsics/ms_simd_instructions.h"
2973+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2974+
2975+#ifdef __cplusplus
2976+extern "C" {
2977+#endif
2978+#pragma GCC push_options
2979+#pragma GCC target("avx", "avx2")
2980+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2981+#define BLOCK_NUM 8
2982+#define MS_SIMD_AVX
2983+
2984+static inline int PowerBroadCastIntExponentAVX(int index, const float *input, int exponent, float *output, int len,
2985+  float scale, float shift) {
2986+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
2987+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
2988+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2989+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
2990+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
2991+    int exp = abs(exponent);
2992+    while (exp) {
2993+      if (exp % 2) {
2994+        result = SIMD_MUL_F32(result, tmp);
2995+      }
2996+      tmp = SIMD_MUL_SQUARE_F32(tmp);
2997+      exp = exp / 2;
2998+    }
2999+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
3000+  }
3001+  return index;
3002+}
3003+
3004+static inline int PowerBroadCastFloatExponentAVX(int index, const float *input, float exponent, float *output, int len,
3005+  float scale, float shift) {
3006+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
3007+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
3008+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3009+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
3010+    SIMD_F32 result;
3011+    for (int i = 0; i < BLOCK_NUM; ++i) {
3012+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
3013+    }
3014+    SIMD_ST_F32(output + index, result);
3015+  }
3016+  return index;
3017+}
3018+
3019+static inline int PowerSingleExponentAVX(int index, const float *input, const float *exponent, float *output, int len,
3020+  float scale, float shift) {
3021+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
3022+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
3023+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3024+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
3025+    for (int j = 0; j < BLOCK_NUM; ++j) {
3026+      float cur_exponent = exponent[index + j];
3027+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
3028+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
3029+        int exp = abs((int)(cur_exponent));
3030+        float result = 1;
3031+        while (exp) {
3032+          if (exp % 2) {
3033+            result *= cur_val;
3034+          }
3035+          cur_val *= cur_val;
3036+          exp = exp / 2;
3037+        }
3038+        output[index + j] = cur_exponent >= 0 ? result : 1 / result;
3039+      } else {
3040+        output[index + j] = powf(cur_val, cur_exponent);
3041+      }
3042+    }
3043+  }
3044+  return index;
3045+}
3046+
3047+#undef MS_SIMD_INSTRUCTION
3048+#undef BLOCK_NUM
3049+#pragma GCC pop_options
3050+#undef MS_SIMD_AVX
3051+#ifdef __cplusplus
3052+}
3053+#endif
3054+#endif
3055diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
3056new file mode 100644
3057index 00000000..03339e42
3058--- /dev/null
3059+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
3060@@ -0,0 +1,181 @@
3061+/**
3062+ * Copyright 2022 Huawei Technologies Co., Ltd
3063+ *
3064+ * Licensed under the Apache License, Version 2.0 (the "License");
3065+ * you may not use this file except in compliance with the License.
3066+ * You may obtain a copy of the License at
3067+ *
3068+ * http://www.apache.org/licenses/LICENSE-2.0
3069+ *
3070+ * Unless required by applicable law or agreed to in writing, software
3071+ * distributed under the License is distributed on an "AS IS" BASIS,
3072+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3073+ * See the License for the specific language governing permissions and
3074+ * limitations under the License.
3075+ */
3076+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_
3077+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_
3078+
3079+#include "nnacl/intrinsics/ms_simd_instructions.h"
3080+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
3081+
3082+#ifdef __cplusplus
3083+extern "C" {
3084+#endif
3085+#pragma GCC push_options
3086+#pragma GCC target("avx", "avx2")
3087+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
3088+#define BLOCK_NUM 8
3089+#define MS_SIMD_AVX
3090+
3091+static inline int64_t ReduceSumAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3092+  int axis_size) {
3093+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3094+    const float *inner_src = outer_src + index;
3095+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3096+    for (int i = 0; i < axis_size; i++) {
3097+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3098+    }
3099+    SIMD_ST_F32(outer_dst + index, tmp);
3100+  }
3101+  return index;
3102+}
3103+
3104+static inline int64_t ReduceMeanAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3105+  int axis_size) {
3106+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3107+    const float *inner_src = outer_src + index;
3108+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3109+    for (int i = 0; i < axis_size; i++) {
3110+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3111+    }
3112+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
3113+  }
3114+  return index;
3115+}
3116+
3117+static inline int64_t ReduceMinAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3118+  int axis_size) {
3119+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3120+    const float *inner_src = outer_src + index;
3121+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
3122+    for (int i = 0; i < axis_size; i++) {
3123+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3124+    }
3125+    SIMD_ST_F32(outer_dst + index, tmp);
3126+  }
3127+  return index;
3128+}
3129+
3130+static inline int64_t ReduceMaxAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3131+  int axis_size) {
3132+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3133+    const float *inner_src = outer_src + index;
3134+    SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX);
3135+    for (int i = 0; i < axis_size; i++) {
3136+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3137+    }
3138+    SIMD_ST_F32(outer_dst + index, tmp);
3139+  }
3140+  return index;
3141+}
3142+
3143+static inline int64_t ReduceProdAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3144+  int axis_size) {
3145+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3146+    const float *inner_src = outer_src + index;
3147+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
3148+    for (int i = 0; i < axis_size; i++) {
3149+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3150+    }
3151+    SIMD_ST_F32(outer_dst + index, tmp);
3152+  }
3153+  return index;
3154+}
3155+
3156+static inline int64_t ReduceSumSquareAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3157+  int axis_size) {
3158+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3159+    const float *inner_src = outer_src + index;
3160+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3161+    for (int i = 0; i < axis_size; i++) {
3162+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
3163+    }
3164+    SIMD_ST_F32(outer_dst + index, tmp);
3165+  }
3166+  return index;
3167+}
3168+
3169+static inline int64_t ReduceL2NormAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3170+  int axis_size) {
3171+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3172+    const float *inner_src = outer_src + index;
3173+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3174+    for (int i = 0; i < axis_size; i++) {
3175+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
3176+    }
3177+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));
3178+  }
3179+  return index;
3180+}
3181+
3182+static inline int64_t IntReduceSumAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3183+  int axis_size) {
3184+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3185+    const int *inner_src = outer_src + index;
3186+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
3187+    for (int i = 0; i < axis_size; i++) {
3188+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3189+    }
3190+    SIMD_ST_EPI32(outer_dst + index, tmp);
3191+  }
3192+  return index;
3193+}
3194+
3195+static inline int64_t IntReduceMeanAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3196+  int axis_size) {
3197+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3198+    const int *inner_src = outer_src + index;
3199+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
3200+    for (int i = 0; i < axis_size; i++) {
3201+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3202+    }
3203+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));
3204+  }
3205+  return index;
3206+}
3207+
3208+static inline int64_t IntReduceMinAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3209+  int axis_size) {
3210+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3211+    const int *inner_src = outer_src + index;
3212+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);
3213+    for (int i = 0; i < axis_size; i++) {
3214+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3215+    }
3216+    SIMD_ST_EPI32(outer_dst + index, tmp);
3217+  }
3218+  return index;
3219+}
3220+
3221+static inline int64_t IntReduceMaxAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3222+  int axis_size) {
3223+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3224+    const int *inner_src = outer_src + index;
3225+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);
3226+    for (int i = 0; i < axis_size; i++) {
3227+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3228+    }
3229+    SIMD_ST_EPI32(outer_dst + index, tmp);
3230+  }
3231+  return index;
3232+}
3233+
3234+#undef MS_SIMD_INSTRUCTION
3235+#undef BLOCK_NUM
3236+#pragma GCC pop_options
3237+#undef MS_SIMD_AVX
3238+#ifdef __cplusplus
3239+}
3240+#endif
3241+#endif
3242diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
3243new file mode 100644
3244index 00000000..8229111d
3245--- /dev/null
3246+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
3247@@ -0,0 +1,87 @@
3248+/**
3249+ * Copyright 2022 Huawei Technologies Co., Ltd
3250+ *
3251+ * Licensed under the Apache License, Version 2.0 (the "License");
3252+ * you may not use this file except in compliance with the License.
3253+ * You may obtain a copy of the License at
3254+ *
3255+ * http://www.apache.org/licenses/LICENSE-2.0
3256+ *
3257+ * Unless required by applicable law or agreed to in writing, software
3258+ * distributed under the License is distributed on an "AS IS" BASIS,
3259+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3260+ * See the License for the specific language governing permissions and
3261+ * limitations under the License.
3262+ */
3263+
3264+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_
3265+#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_
3266+
3267+#include "nnacl/intrinsics/ms_simd_instructions.h"
3268+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
3269+
3270+#ifdef __cplusplus
3271+extern "C" {
3272+#endif
3273+#pragma GCC push_options
3274+#pragma GCC target("avx", "avx2")
3275+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
3276+#define BLOCK_NUM 8
3277+#define MS_SIMD_AVX
3278+
3279+static inline int64_t SoftmaxNormGetMaxAVX(int64_t index, const float *src, int cur_batch_offset,
3280+  float *max, int channel) {
3281+  if (channel >= BLOCK_NUM * BLOCK_NUM) {
3282+    SIMD_F32 max_val = SIMD_MOV_F32(*max);
3283+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3284+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
3285+    }
3286+    *max = SIMD_GET_MAX_F32(max_val);
3287+  }
3288+  return index;
3289+}
3290+
3291+static inline int64_t SoftmaxNormCalcNormAVX(int64_t index, const float *src, float *dst,
3292+  int cur_batch_offset, float max, int channel) {
3293+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3294+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
3295+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
3296+  }
3297+  return index;
3298+}
3299+
3300+static inline int64_t SoftmaxLastAxisGetExpSumAVX(int64_t index, const float *src, float *dst,
3301+  int cur_batch_offset, float max, float *exp_sum, int channel) {
3302+#ifndef _WIN32
3303+  SIMD_F32 sum_val = SIMD_SET0_F32;
3304+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3305+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
3306+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
3307+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
3308+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
3309+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
3310+  }
3311+  *exp_sum += SIMD_GET_SUM_F32(sum_val);
3312+#endif
3313+  return index;
3314+}
3315+
3316+static inline int64_t SoftmaxLastAxisGetResultAVX(int64_t index, const float *src, float *dst,
3317+  int cur_batch_offset, float exp_sum, int channel) {
3318+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
3319+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3320+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
3321+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
3322+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
3323+  }
3324+  return index;
3325+}
3326+
3327+#undef MS_SIMD_INSTRUCTION
3328+#undef BLOCK_NUM
3329+#pragma GCC pop_options
3330+#undef MS_SIMD_AVX
3331+#ifdef __cplusplus
3332+}
3333+#endif
3334+#endif
3335diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
3336new file mode 100644
3337index 00000000..a3ed93d4
3338--- /dev/null
3339+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
3340@@ -0,0 +1,167 @@
3341+/**
3342+ * Copyright 2022 Huawei Technologies Co., Ltd
3343+ *
3344+ * Licensed under the Apache License, Version 2.0 (the "License");
3345+ * you may not use this file except in compliance with the License.
3346+ * You may obtain a copy of the License at
3347+ *
3348+ * http://www.apache.org/licenses/LICENSE-2.0
3349+ *
3350+ * Unless required by applicable law or agreed to in writing, software
3351+ * distributed under the License is distributed on an "AS IS" BASIS,
3352+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3353+ * See the License for the specific language governing permissions and
3354+ * limitations under the License.
3355+ */
3356+
3357+#ifndef MINDSPORE_NNACL_FP32_SUB_AVX_H_
3358+#define MINDSPORE_NNACL_FP32_SUB_AVX_H_
3359+
3360+#include "nnacl/intrinsics/ms_simd_instructions.h"
3361+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
3362+
3363+#ifdef __cplusplus
3364+extern "C" {
3365+#endif
3366+#pragma GCC push_options
3367+#pragma GCC target("avx", "avx2")
3368+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
3369+#define BLOCK_NUM 8
3370+#define MS_SIMD_AVX
3371+
3372+static inline int ElementOptSubNum0AVX(int index, const float *in0, const float *in1, float *out,
3373+                                                      int size) {
3374+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
3375+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3376+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3377+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
3378+    SIMD_ST_F32(out + index, vout);
3379+  }
3380+  return index;
3381+}
3382+
3383+static inline int ElementOptSubNum1AVX(int index, const float *in0, const float *in1, float *out,
3384+                                                      int size) {
3385+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
3386+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3387+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3388+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
3389+    SIMD_ST_F32(out + index, vout);
3390+  }
3391+  return index;
3392+}
3393+
3394+static inline int ElementOptSubIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
3395+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
3396+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3397+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
3398+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
3399+    SIMD_ST_EPI32(out + index, vout);
3400+  }
3401+  return index;
3402+}
3403+
3404+static inline int ElementOptSubIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
3405+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
3406+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3407+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
3408+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
3409+    SIMD_ST_EPI32(out + index, vout);
3410+  }
3411+  return index;
3412+}
3413+
3414+static inline int ElementOptSubReluNum0AVX(int index, const float *in0, const float *in1, float *out,
3415+                                                          int size) {
3416+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
3417+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3418+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3419+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
3420+    SIMD_ST_F32(out + index, vout);
3421+  }
3422+  return index;
3423+}
3424+
3425+static inline int ElementOptSubReluNum1AVX(int index, const float *in0, const float *in1, float *out,
3426+                                                          int size) {
3427+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
3428+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3429+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3430+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
3431+    SIMD_ST_F32(out + index, vout);
3432+  }
3433+  return index;
3434+}
3435+
3436+static inline int ElementOptSubRelu6Num0AVX(int index, const float *in0, const float *in1, float *out,
3437+                                                           int size) {
3438+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
3439+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3440+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3441+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
3442+    SIMD_ST_F32(out + index, vout);
3443+  }
3444+  return index;
3445+}
3446+
3447+static inline int ElementOptSubRelu6Num1AVX(int index, const float *in0, const float *in1, float *out,
3448+                                                           int size) {
3449+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
3450+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3451+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3452+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
3453+    SIMD_ST_F32(out + index, vout);
3454+  }
3455+  return index;
3456+}
3457+
3458+static inline int ElementSubAVX(int index, const float *in0, const float *in1, float *out, int size) {
3459+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3460+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3461+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3462+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
3463+    SIMD_ST_F32(out + index, vout);
3464+  }
3465+  return index;
3466+}
3467+
3468+static inline int ElementSubIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
3469+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3470+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
3471+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
3472+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
3473+    SIMD_ST_EPI32(out + index, vout);
3474+  }
3475+  return index;
3476+}
3477+
3478+static inline int ElementSubReluAVX(int index, const float *in0, const float *in1, float *out,
3479+                                                   int size) {
3480+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3481+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3482+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3483+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
3484+    SIMD_ST_F32(out + index, vout);
3485+  }
3486+  return index;
3487+}
3488+
3489+static inline int ElementSubRelu6AVX(int index, const float *in0, const float *in1, float *out,
3490+                                                    int size) {
3491+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3492+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3493+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3494+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
3495+    SIMD_ST_F32(out + index, vout);
3496+  }
3497+  return index;
3498+}
3499+
3500+#undef MS_SIMD_INSTRUCTION
3501+#undef BLOCK_NUM
3502+#pragma GCC pop_options
3503+#undef MS_SIMD_AVX
3504+#ifdef __cplusplus
3505+}
3506+#endif
3507+#endif
3508diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
3509new file mode 100644
3510index 00000000..f6457628
3511--- /dev/null
3512+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
3513@@ -0,0 +1,221 @@
3514+/**
3515+ * Copyright 2022 Huawei Technologies Co., Ltd
3516+ *
3517+ * Licensed under the Apache License, Version 2.0 (the "License");
3518+ * you may not use this file except in compliance with the License.
3519+ * You may obtain a copy of the License at
3520+ *
3521+ * http://www.apache.org/licenses/LICENSE-2.0
3522+ *
3523+ * Unless required by applicable law or agreed to in writing, software
3524+ * distributed under the License is distributed on an "AS IS" BASIS,
3525+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3526+ * See the License for the specific language governing permissions and
3527+ * limitations under the License.
3528+ */
3529+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
3530+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
3531+
3532+#include "nnacl/intrinsics/ms_simd_instructions.h"
3533+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
3534+
3535+#ifdef __cplusplus
3536+extern "C" {
3537+#endif
3538+#pragma GCC push_options
3539+#pragma GCC target("avx512f")
3540+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
3541+#define BLOCK_NUM 16
3542+#define MS_SIMD_AVX512
3543+
3544+static inline int Fp32ReluAVX512(int index, const float *src, int length, float *dst) {
3545+    SIMD_F32 zero = SIMD_SET0_F32;
3546+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3547+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
3548+    }
3549+    return index;
3550+}
3551+
3552+static inline int Int32ReluAVX512(int index, const int32_t *src, int length, int32_t *dst) {
3553+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0);
3554+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3555+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
3556+    }
3557+    return index;
3558+}
3559+
3560+static inline int Fp32Relu6AVX512(int index, const float *src, int length, float *dst) {
3561+    SIMD_F32 zero = SIMD_SET0_F32;
3562+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
3563+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3564+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
3565+    }
3566+    return index;
3567+}
3568+
3569+static inline int LReluAVX512(int index, const float *src, int length, float *dst, float alpha) {
3570+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
3571+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3572+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3573+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
3574+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
3575+    }
3576+    return index;
3577+}
3578+
3579+static inline int SigmoidAVX512(int index, const float *src, int length, float *dst) {
3580+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3581+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
3582+        SIMD_ST_F32(dst + index,
3583+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
3584+    }
3585+    return index;
3586+}
3587+
3588+static inline int TanhAVX512(int index, const float *src, int length, float *dst) {
3589+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3590+        SIMD_F32 input = SIMD_LD_F32(src + index);
3591+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
3592+    }
3593+    return index;
3594+}
3595+
3596+static inline int SwishAVX512(int index, const float *src, int length, float *dst) {
3597+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3598+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
3599+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
3600+        SIMD_ST_F32(dst + index,
3601+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
3602+    }
3603+    return index;
3604+}
3605+
3606+static inline int HSwishAVX512(int index, const float *src, int length, float *dst) {
3607+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3608+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
3609+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
3610+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
3611+    }
3612+    return index;
3613+}
3614+
3615+static inline int HSigmoidAVX512(int index, const float *src, int length, float *dst) {
3616+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3617+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
3618+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
3619+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
3620+    }
3621+    return index;
3622+}
3623+
3624+static inline int HardTanhNoLimitMinAVX512(int index, const float *src, int length, float *dst, float min_val,
3625+                                            float max_val) {
3626+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3627+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
3628+    }
3629+    return index;
3630+}
3631+
3632+static inline int HardTanhNoLimitMaxAVX512(int index, const float *src, int length, float *dst, float min_val,
3633+                                            float max_val) {
3634+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3635+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
3636+    }
3637+    return index;
3638+}
3639+
3640+static inline int HardTanhLimitMinMaxAVX512(int index, const float *src, int length, float *dst, float min_val,
3641+                                             float max_val) {
3642+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3643+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
3644+    }
3645+    return index;
3646+}
3647+
3648+static inline int GeluApproximateAVX512(int index, const float *src, int length, float *dst) {
3649+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3650+        SIMD_F32 in = SIMD_LD_F32(src + index);
3651+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
3652+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
3653+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
3654+    }
3655+    return index;
3656+}
3657+
3658+static inline int GeluAVX512(int index, const float *src, int length, float *dst) {
3659+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
3660+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
3661+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
3662+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3663+      SIMD_F32 in = SIMD_LD_F32(src + index);
3664+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
3665+      SIMD_ST_F32(dst + index, res);
3666+    }
3667+    return index;
3668+}
3669+
3670+static inline int EluAVX512(int index, const float *src, int length, float *dst, float alpha) {
3671+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3672+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3673+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
3674+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
3675+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
3676+    }
3677+    return index;
3678+}
3679+
3680+static inline int CeluAVX512(int index, const float *src, int length, float *dst, float alpha) {
3681+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3682+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3683+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
3684+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
3685+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
3686+    }
3687+    return index;
3688+}
3689+
3690+static inline int HShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) {
3691+    const float neg_lambd = -1 * lambd;
3692+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3693+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3694+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
3695+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
3696+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
3697+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
3698+    }
3699+    return index;
3700+}
3701+
3702+static inline int SoftShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) {
3703+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
3704+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
3705+
3706+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3707+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
3708+        /* v0 = (in > lamdb) & (in - lamdb) */
3709+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
3710+        /* v1 = (in < -lamdb) & (in + lamdb) */
3711+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
3712+        /* out = (v0 | v1) */
3713+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
3714+    }
3715+    return index;
3716+}
3717+
3718+static inline int SoftsignFp32OptAVX512(int index, const float *src, int length, float *dst) {
3719+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3720+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3721+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
3722+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
3723+    }
3724+    return index;
3725+}
3726+
3727+#undef MS_SIMD_INSTRUCTION
3728+#undef BLOCK_NUM
3729+#pragma GCC pop_options
3730+#undef MS_SIMD_AVX512
3731+#ifdef __cplusplus
3732+}
3733+#endif
3734+#endif
3735diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
3736new file mode 100644
3737index 00000000..62d34db4
3738--- /dev/null
3739+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
3740@@ -0,0 +1,57 @@
3741+/**
3742+ * Copyright 2022 Huawei Technologies Co., Ltd
3743+ *
3744+ * Licensed under the Apache License, Version 2.0 (the "License");
3745+ * you may not use this file except in compliance with the License.
3746+ * You may obtain a copy of the License at
3747+ *
3748+ * http://www.apache.org/licenses/LICENSE-2.0
3749+ *
3750+ * Unless required by applicable law or agreed to in writing, software
3751+ * distributed under the License is distributed on an "AS IS" BASIS,
3752+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3753+ * See the License for the specific language governing permissions and
3754+ * limitations under the License.
3755+ */
3756+
3757+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_
3758+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_
3759+
3760+#include "nnacl/intrinsics/ms_simd_instructions.h"
3761+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
3762+
3763+#ifdef __cplusplus
3764+extern "C" {
3765+#endif
3766+#pragma GCC push_options
3767+#pragma GCC target("avx512f")
3768+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
3769+#define BLOCK_NUM 16
3770+#define MS_SIMD_AVX512
3771+
3772+static inline int ShrinkGradAVX512(int index, const float *src0, const float *src1,
3773+                                               int length, float *dst, float lambd) {
3774+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
3775+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
3776+
3777+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3778+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
3779+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
3780+
3781+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
3782+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
3783+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
3784+
3785+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
3786+    }
3787+    return index;
3788+}
3789+
3790+#undef MS_SIMD_INSTRUCTION
3791+#undef BLOCK_NUM
3792+#pragma GCC pop_options
3793+#undef MS_SIMD_AVX512
3794+#ifdef __cplusplus
3795+}
3796+#endif
3797+#endif
3798diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
3799new file mode 100644
3800index 00000000..0579d58a
3801--- /dev/null
3802+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
3803@@ -0,0 +1,210 @@
3804+/**
3805+ * Copyright 2022 Huawei Technologies Co., Ltd
3806+ *
3807+ * Licensed under the Apache License, Version 2.0 (the "License");
3808+ * you may not use this file except in compliance with the License.
3809+ * You may obtain a copy of the License at
3810+ *
3811+ * http://www.apache.org/licenses/LICENSE-2.0
3812+ *
3813+ * Unless required by applicable law or agreed to in writing, software
3814+ * distributed under the License is distributed on an "AS IS" BASIS,
3815+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3816+ * See the License for the specific language governing permissions and
3817+ * limitations under the License.
3818+ */
3819+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_
3820+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_
3821+
3822+#include "nnacl/intrinsics/ms_simd_instructions.h"
3823+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
3824+
3825+#ifdef __cplusplus
3826+extern "C" {
3827+#endif
3828+#pragma GCC push_options
3829+#pragma GCC target("avx512f")
3830+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
3831+#define BLOCK_NUM 16
3832+#define MS_SIMD_AVX512
3833+#ifdef MS_SIMD_AVX512
3834+  static inline size_t AdamWeightDecayFp32AVX512(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3835+    const float *gradient, size_t end) {
3836+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3837+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3838+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3839+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3840+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3841+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3842+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3843+
3844+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3845+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
3846+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3847+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3848+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
3849+
3850+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3851+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3852+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3853+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3854+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3855+    avx_r0 = SIMD_SQRT_F32(v_r);
3856+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3857+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3858+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3859+    SIMD_ST_F32(m + index, m_r);
3860+    SIMD_ST_F32(v + index, v_r);
3861+    SIMD_ST_F32(var + index, var_r);
3862+  }
3863+
3864+  return index;
3865+}
3866+
3867+static inline size_t FusedCastAdamFp32Fp16AVX512(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3868+    float global_norm_reciprocal, size_t end) {
3869+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3870+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3871+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3872+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3873+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3874+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3875+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3876+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3877+
3878+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3879+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
3880+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3881+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3882+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
3883+
3884+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3885+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3886+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3887+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3888+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3889+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3890+    avx_r0 = SIMD_SQRT_F32(v_r);
3891+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3892+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3893+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3894+    SIMD_ST_F32(var + index, var_r);
3895+    SIMD_ST_F32(m + index, m_r);
3896+    SIMD_ST_F32(v + index, v_r);
3897+  }
3898+
3899+  return index;
3900+}
3901+
3902+static inline size_t FusedCastAdamFp32Fp32AVX512(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3903+    float global_norm_reciprocal, size_t end) {
3904+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3905+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3906+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3907+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3908+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3909+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3910+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3911+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3912+
3913+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3914+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
3915+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3916+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3917+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
3918+
3919+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3920+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3921+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3922+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3923+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3924+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3925+    avx_r0 = SIMD_SQRT_F32(v_r);
3926+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3927+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3928+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3929+    SIMD_ST_F32(var + index, var_r);
3930+    SIMD_ST_F32(m + index, m_r);
3931+    SIMD_ST_F32(v + index, v_r);
3932+  }
3933+
3934+  return index;
3935+}
3936+
3937+static inline size_t FusedCastAdamFp16Fp16AVX512(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3938+    float global_norm_reciprocal, size_t end) {
3939+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3940+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3941+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3942+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3943+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3944+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3945+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3946+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3947+
3948+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3949+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
3950+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3951+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3952+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
3953+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3954+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3955+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3956+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3957+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3958+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3959+    avx_r0 = SIMD_SQRT_F32(v_r);
3960+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3961+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3962+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3963+    SIMD_ST_F32(m + index, m_r);
3964+    SIMD_ST_F32(v + index, v_r);
3965+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
3966+  }
3967+
3968+  return index;
3969+}
3970+
3971+static inline size_t FusedCastAdamFp16Fp32AVX512(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3972+    float global_norm_reciprocal, size_t end) {
3973+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3974+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3975+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3976+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3977+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3978+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3979+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3980+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3981+
3982+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3983+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
3984+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3985+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3986+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
3987+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3988+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3989+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3990+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3991+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3992+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3993+    avx_r0 = SIMD_SQRT_F32(v_r);
3994+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3995+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3996+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3997+    SIMD_ST_F32(m + index, m_r);
3998+    SIMD_ST_F32(v + index, v_r);
3999+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
4000+  }
4001+
4002+  return index;
4003+}
4004+#endif
4005+
4006+#undef MS_SIMD_INSTRUCTION
4007+#undef BLOCK_NUM
4008+#pragma GCC pop_options
4009+#undef MS_SIMD_AVX512
4010+#ifdef __cplusplus
4011+}
4012+#endif
4013+#endif
4014diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
4015new file mode 100644
4016index 00000000..5ec6a42e
4017--- /dev/null
4018+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
4019@@ -0,0 +1,124 @@
4020+/**
4021+ * Copyright 2022 Huawei Technologies Co., Ltd
4022+ *
4023+ * Licensed under the Apache License, Version 2.0 (the "License");
4024+ * you may not use this file except in compliance with the License.
4025+ * You may obtain a copy of the License at
4026+ *
4027+ * http://www.apache.org/licenses/LICENSE-2.0
4028+ *
4029+ * Unless required by applicable law or agreed to in writing, software
4030+ * distributed under the License is distributed on an "AS IS" BASIS,
4031+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4032+ * See the License for the specific language governing permissions and
4033+ * limitations under the License.
4034+ */
4035+
4036+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX512_H_
4037+#define MINDSPORE_NNACL_FP32_ADD_AVX512_H_
4038+
4039+#include "nnacl/intrinsics/ms_simd_instructions.h"
4040+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4041+
4042+#ifdef __cplusplus
4043+extern "C" {
4044+#endif
4045+#pragma GCC push_options
4046+#pragma GCC target("avx512f")
4047+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4048+#define BLOCK_NUM 16
4049+#define MS_SIMD_AVX512
4050+
// Broadcast add: out[i] = in0[0] + in1[i].  Processes BLOCK_NUM lanes per
// iteration starting at `index`; returns the first index NOT processed so the
// caller's scalar tail loop can finish the remainder (size - index elements).
static inline int ElementOptAddAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Integer variant of the broadcast add: out[i] = in0[0] + in1[i].
static inline int ElementOptAddIntAVX512(int index, const int *in0, const int *in1, int *out,
                                                     int size) {
  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
    SIMD_ST_EPI32(out + index, vout);
  }
  return index;
}

// Broadcast add fused with ReLU: out[i] = max(in0[0] + in1[i], 0).
static inline int ElementOptAddReluAVX512(int index, const float *in0, const float *in1, float *out,
                                                      int size) {
  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Broadcast add fused with ReLU6: out[i] = clamp(in0[0] + in1[i], 0, 6).
static inline int ElementOptAddRelu6AVX512(int index, const float *in0, const float *in1, float *out,
                                                       int size) {
  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}
4093+
// Element-wise add: out[i] = in0[i] + in1[i].  Processes BLOCK_NUM lanes per
// iteration; returns the first unprocessed index for the scalar tail loop.
static inline int ElementAddAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Element-wise add fused with ReLU: out[i] = max(in0[i] + in1[i], 0).
static inline int ElementAddReluAVX512(int index, const float *in0, const float *in1, float *out,
                                                   int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Element-wise add fused with ReLU6: out[i] = clamp(in0[i] + in1[i], 0, 6).
static inline int ElementAddRelu6AVX512(int index, const float *in0, const float *in1, float *out,
                                                    int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Integer element-wise add: out[i] = in0[i] + in1[i].
static inline int ElementAddIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
    SIMD_ST_EPI32(out + index, vout);
  }
  return index;
}
4135+
4136+#undef MS_SIMD_INSTRUCTION
4137+#undef BLOCK_NUM
4138+#pragma GCC pop_options
4139+#undef MS_SIMD_AVX512
4140+#ifdef __cplusplus
4141+}
4142+#endif
4143+#endif
4144diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
4145new file mode 100644
4146index 00000000..aa478969
4147--- /dev/null
4148+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
4149@@ -0,0 +1,254 @@
4150+/**
4151+ * Copyright 2022 Huawei Technologies Co., Ltd
4152+ *
4153+ * Licensed under the Apache License, Version 2.0 (the "License");
4154+ * you may not use this file except in compliance with the License.
4155+ * You may obtain a copy of the License at
4156+ *
4157+ * http://www.apache.org/licenses/LICENSE-2.0
4158+ *
4159+ * Unless required by applicable law or agreed to in writing, software
4160+ * distributed under the License is distributed on an "AS IS" BASIS,
4161+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4162+ * See the License for the specific language governing permissions and
4163+ * limitations under the License.
4164+ */
4165+
4166+#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX512_H_
4167+#define MINDSPORE_NNACL_ARITHMETIC_AVX512_H_
4168+
4169+#include "nnacl/intrinsics/ms_simd_instructions.h"
4170+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4171+
4172+#ifdef __cplusplus
4173+extern "C" {
4174+#endif
4175+#pragma GCC push_options
4176+#pragma GCC target("avx512f")
4177+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4178+#define BLOCK_NUM 16
4179+#define MS_SIMD_AVX512
4180+
#ifndef MS_SIMD_NEON
// Floored modulo (Python-style, result sign follows the divisor):
//   out[i] = in0[i] - floor(in0[i] / in1[i]) * in1[i]
static inline int ElementFloorModAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floored modulo with in0 broadcast from a scalar (Num0 = operand 0 is scalar).
static inline int ElementOptFloorModNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floored modulo with in1 broadcast from a scalar (Num1 = operand 1 is scalar).
static inline int ElementOptFloorModNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floor division: out[i] = floor(in0[i] / in1[i]).
static inline int ElementFloorDivAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, floor_tmp);
  }
  return index;
}

// Floor division with in0 broadcast from a scalar.
static inline int ElementOptFloorDivNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floor division with in1 broadcast from a scalar.
static inline int ElementOptFloorDivNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}
#endif
4245+
// Integer "floor" division: out[i] = in0[i] / in1[i].
// NOTE(review): SIMD_DIV_EPI32 presumably truncates toward zero like C integer
// division; for mixed-sign operands that differs from true floor division
// (e.g. -7/2 -> -3 truncated vs -4 floored) — confirm the intended semantics.
static inline int ElementFloorDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer division with in0 broadcast from a scalar.
static inline int ElementOptFloorDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer division with in1 broadcast from a scalar.
static inline int ElementOptFloorDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}
4275+
// Element-wise maximum: out[i] = max(in0[i], in1[i]).  All variants process
// BLOCK_NUM lanes per iteration and return the first unprocessed index.
static inline int ElementMaximumAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Maximum with in0 broadcast from a scalar.
static inline int ElementOptMaximumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Maximum with in1 broadcast from a scalar.
static inline int ElementOptMaximumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Integer element-wise maximum.
static inline int ElementMaximumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer maximum with in0 broadcast from a scalar.
static inline int ElementOptMaximumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer maximum with in1 broadcast from a scalar.
static inline int ElementOptMaximumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}
4335+
// Integer element-wise minimum: out[i] = min(in0[i], in1[i]).
static inline int ElementMinimumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer minimum with in0 broadcast from a scalar.
static inline int ElementOptMinimumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer minimum with in1 broadcast from a scalar.
static inline int ElementOptMinimumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Float element-wise minimum: out[i] = min(in0[i], in1[i]).
static inline int ElementMinimumAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Float minimum with in0 broadcast from a scalar.
static inline int ElementOptMinimumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Float minimum with in1 broadcast from a scalar.
static inline int ElementOptMinimumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}
4395+
4396+#undef MS_SIMD_INSTRUCTION
4397+#undef BLOCK_NUM
4398+#pragma GCC pop_options
4399+#undef MS_SIMD_AVX512
4400+#ifdef __cplusplus
4401+}
4402+#endif
4403+#endif
4404diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
4405new file mode 100644
4406index 00000000..c671e327
4407--- /dev/null
4408+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
4409@@ -0,0 +1,129 @@
4410+/**
4411+ * Copyright 2022 Huawei Technologies Co., Ltd
4412+ *
4413+ * Licensed under the Apache License, Version 2.0 (the "License");
4414+ * you may not use this file except in compliance with the License.
4415+ * You may obtain a copy of the License at
4416+ *
4417+ * http://www.apache.org/licenses/LICENSE-2.0
4418+ *
4419+ * Unless required by applicable law or agreed to in writing, software
4420+ * distributed under the License is distributed on an "AS IS" BASIS,
4421+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4422+ * See the License for the specific language governing permissions and
4423+ * limitations under the License.
4424+ */
4425+
4426+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_
4427+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_
4428+
4429+#include "nnacl/intrinsics/ms_simd_instructions.h"
4430+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4431+
4432+#ifdef __cplusplus
4433+extern "C" {
4434+#endif
4435+#pragma GCC push_options
4436+#pragma GCC target("avx512f")
4437+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4438+#define BLOCK_NUM 16
4439+#define MS_SIMD_AVX512
4440+
#if defined(MS_SIMD_AVX512)
// Only AVX-512 provides a native abs instruction for fp32, so these are
// emitted only in the AVX-512 build (the guard is always true in this file,
// where MS_SIMD_AVX512 is defined above).
// out[i] = |input[i]|; returns the first unprocessed index.
static inline int ElementAbsAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}

// Integer absolute value: out[i] = |input[i]|.
static inline int ElementAbsIntAVX512(int index, const int *input, int *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
  }
  return index;
}
#endif
4457+
// out[i] = input[i]^2; returns the first unprocessed index.
static inline int ElementSquareAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin = SIMD_LD_F32(input + index);
    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
  }
  return index;
}

// out[i] = sqrt(input[i]).
static inline int ElementSqrtAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}

// out[i] = 1 / sqrt(input[i]).
// NOTE(review): SIMD_RSQRT_F32 may map to the approximate reciprocal-sqrt
// instruction, which has reduced precision — confirm against the macro impl.
static inline int ElementRsqrtAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
4479+
#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
// AVX-512 doesn't provide the round fp32 instruction used here, so the round
// kernel is only emitted for the AVX/SSE builds (compiled out in this file).
static inline int ElementRoundAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
#endif

#ifndef MS_SIMD_NEON
// NEON doesn't provide a floor fp32 instruction; out[i] = floor(input[i]).
static inline int ElementFloorAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
#endif

#ifndef MS_SIMD_NEON
// out[i] = ceil(input[i]); likewise unavailable on NEON.
static inline int ElementCeilAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
#endif
4508+
// out[i] = -input[i] (implemented as multiply by -1.0f).
static inline int ElementNegativeAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
  }
  return index;
}

// Integer negation: out[i] = -input[i].
static inline int ElementNegativeIntAVX512(int index, const int *input, int *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
  }
  return index;
}

// out[i] = 1 / input[i] (full-precision division, not the approximate rcp).
static inline int ElementReciprocalAVX512(int index, const float *input, float *output, const int element_size) {
  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
  }
  return index;
}
4530+
4531+#undef MS_SIMD_INSTRUCTION
4532+#undef BLOCK_NUM
4533+#pragma GCC pop_options
4534+#undef MS_SIMD_AVX512
4535+#ifdef __cplusplus
4536+}
4537+#endif
4538+#endif
4539diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
4540new file mode 100644
4541index 00000000..fd945984
4542--- /dev/null
4543+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
4544@@ -0,0 +1,67 @@
4545+/**
4546+ * Copyright 2022 Huawei Technologies Co., Ltd
4547+ *
4548+ * Licensed under the Apache License, Version 2.0 (the "License");
4549+ * you may not use this file except in compliance with the License.
4550+ * You may obtain a copy of the License at
4551+ *
4552+ * http://www.apache.org/licenses/LICENSE-2.0
4553+ *
4554+ * Unless required by applicable law or agreed to in writing, software
4555+ * distributed under the License is distributed on an "AS IS" BASIS,
4556+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4557+ * See the License for the specific language governing permissions and
4558+ * limitations under the License.
4559+ */
4560+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
4561+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
4562+
4563+#include "nnacl/intrinsics/ms_simd_instructions.h"
4564+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4565+
4566+#ifdef __cplusplus
4567+extern "C" {
4568+#endif
4569+#pragma GCC push_options
4570+#pragma GCC target("avx512f")
4571+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4572+#define BLOCK_NUM 16
4573+#define MS_SIMD_AVX512
4574+
// Batch normalization without scale/offset, vectorized over channels:
//   out[c] = (x[c] - mean[c]) / sqrt(variance[c] + epsilon)
// Returns the first channel index not processed (scalar tail handles the rest).
static inline int BatchNormFp32AVX512(int index, const float *input, const float *mean,
  const float *variance, int channel, float epsilon, float *output) {
  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 input_data = SIMD_LD_F32(input + index);
    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
    SIMD_ST_F32(output + index, output_data);
  }
  return index;
}

// Fused batch norm with per-channel affine transform:
//   out[c] = scale[c] * (x[c] - mean[c]) / sqrt(variance[c] + epsilon) + offset[c]
static inline int FusedBatchNormFp32AVX512(int index, const float *input, const float *scale,
  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 input_data = SIMD_LD_F32(input + index);
    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
    SIMD_ST_F32(output + index, output_data);
  }
  return index;
}
4603+
4604+#undef MS_SIMD_INSTRUCTION
4605+#undef BLOCK_NUM
4606+#pragma GCC pop_options
4607+#undef MS_SIMD_AVX512
4608+#ifdef __cplusplus
4609+}
4610+#endif
4611+#endif
4612diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
4613new file mode 100644
4614index 00000000..f5353f61
4615--- /dev/null
4616+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
4617@@ -0,0 +1,69 @@
4618+/**
4619+ * Copyright 2022 Huawei Technologies Co., Ltd
4620+ *
4621+ * Licensed under the Apache License, Version 2.0 (the "License");
4622+ * you may not use this file except in compliance with the License.
4623+ * You may obtain a copy of the License at
4624+ *
4625+ * http://www.apache.org/licenses/LICENSE-2.0
4626+ *
4627+ * Unless required by applicable law or agreed to in writing, software
4628+ * distributed under the License is distributed on an "AS IS" BASIS,
4629+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4630+ * See the License for the specific language governing permissions and
4631+ * limitations under the License.
4632+ */
4633+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_
4634+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_
4635+
4636+#include "nnacl/intrinsics/ms_simd_instructions.h"
4637+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4638+
4639+#ifdef __cplusplus
4640+extern "C" {
4641+#endif
4642+#pragma GCC push_options
4643+#pragma GCC target("avx512f")
4644+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4645+#define BLOCK_NUM 16
4646+#define MS_SIMD_AVX512
4647+
4648+static inline int BCEWithLogitLossAVX512(int index, const float *logits, const float *label,
4649+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
4650+    float *reduction_sum) {
4651+    SIMD_F32 zero = SIMD_SET0_F32;
4652+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
4653+    SIMD_F32 middle_output = SIMD_SET0_F32;
4654+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4655+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
4656+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
4657+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
4658+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
4659+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
4660+      SIMD_F32 max_value = neg_logits_tmp;
4661+      max_value = SIMD_MIN_F32(max_value, zero);
4662+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
4663+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
4664+      SIMD_F32 log_exp_value =
4665+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
4666+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
4667+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
4668+      if (reduction) {
4669+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
4670+      } else {
4671+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
4672+      }
4673+    }
4674+    if (reduction) {
4675+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
4676+    }
4677+    return index;
4678+}
4679+#undef MS_SIMD_INSTRUCTION
4680+#undef BLOCK_NUM
4681+#pragma GCC pop_options
4682+#undef MS_SIMD_AVX512
4683+#ifdef __cplusplus
4684+}
4685+#endif
4686+#endif
4687diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
4688new file mode 100644
4689index 00000000..abdad5ff
4690--- /dev/null
4691+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
4692@@ -0,0 +1,64 @@
4693+/**
4694+ * Copyright 2022 Huawei Technologies Co., Ltd
4695+ *
4696+ * Licensed under the Apache License, Version 2.0 (the "License");
4697+ * you may not use this file except in compliance with the License.
4698+ * You may obtain a copy of the License at
4699+ *
4700+ * http://www.apache.org/licenses/LICENSE-2.0
4701+ *
4702+ * Unless required by applicable law or agreed to in writing, software
4703+ * distributed under the License is distributed on an "AS IS" BASIS,
4704+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4705+ * See the License for the specific language governing permissions and
4706+ * limitations under the License.
4707+ */
4708+
4709+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
4710+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
4711+
4712+#include "nnacl/intrinsics/ms_simd_instructions.h"
4713+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4714+
4715+#ifdef __cplusplus
4716+extern "C" {
4717+#endif
4718+#pragma GCC push_options
4719+#pragma GCC target("avx512f")
4720+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4721+#define BLOCK_NUM 16
4722+#define MS_SIMD_AVX512
4723+
// Adds a bias vector to one row: output[i] = input[i] + bias[i] for the inner
// dimension of length `num`; returns the first unprocessed index.
static inline int BiasAddByInnerCoreAVX512(int index, const float *input, const float *bias, float *output,
                                                       int64_t num) {
  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
    SIMD_ST_F32(output + index, vout);
  }
  return index;
}

// Adds the same bias vector to four batch rows at once.
// NOTE(review): SIMD_LDX4_F32 presumably declares input_data1..input_data4,
// loading four rows of `input` strided by `num` — confirm against the macro
// definition in the nnacl SIMD instruction headers.
static inline int BiasAddByBatchCoreAVX512(int index, const float *input, const float *bias, float *output1,
                                                       float *output2, float *output3, float *output4, int64_t num) {
  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_LDX4_F32(input_data, input + index, num);
    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
  }
  return index;
}
4747+
4748+#undef MS_SIMD_INSTRUCTION
4749+#undef BLOCK_NUM
4750+#pragma GCC pop_options
4751+#undef MS_SIMD_AVX512
4752+#ifdef __cplusplus
4753+};
4754+#endif
4755+
4756+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_SIMD_H_
4757diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
4758new file mode 100644
4759index 00000000..91d52718
4760--- /dev/null
4761+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
4762@@ -0,0 +1,56 @@
4763+/**
4764+ * Copyright 2022 Huawei Technologies Co., Ltd
4765+ *
4766+ * Licensed under the Apache License, Version 2.0 (the "License");
4767+ * you may not use this file except in compliance with the License.
4768+ * You may obtain a copy of the License at
4769+ *
4770+ * http://www.apache.org/licenses/LICENSE-2.0
4771+ *
4772+ * Unless required by applicable law or agreed to in writing, software
4773+ * distributed under the License is distributed on an "AS IS" BASIS,
4774+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4775+ * See the License for the specific language governing permissions and
4776+ * limitations under the License.
4777+ */
4778+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_
4779+#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_
4780+
4781+#include "nnacl/intrinsics/ms_simd_instructions.h"
4782+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4783+
4784+#ifdef __cplusplus
4785+extern "C" {
4786+#endif
4787+#pragma GCC push_options
4788+#pragma GCC target("avx512f")
4789+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4790+#define BLOCK_NUM 16
4791+#define MS_SIMD_AVX512
4792+
4793+static inline int Int32ToFloat32AVX512(int index, const int32_t *input, float *output, int number) {
4794+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4795+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
4796+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
4797+  }
4798+  return index;
4799+}
4800+
4801+#ifndef MS_SIMD_NEON
4802+static inline int Float32ToInt32AVX512(int index, const float *input, int32_t *output, int number) {
4803+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4804+    SIMD_F32 value = SIMD_LD_F32(input + index);
4805+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
4806+  }
4807+  return index;
4808+}
4809+#endif
4810+
4811+#undef MS_SIMD_INSTRUCTION
4812+#undef BLOCK_NUM
4813+#pragma GCC pop_options
4814+#undef MS_SIMD_AVX512
4815+#ifdef __cplusplus
4816+}
4817+#endif
4818+#endif
4819diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
4820new file mode 100644
4821index 00000000..11a2abcf
4822--- /dev/null
4823+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
4824@@ -0,0 +1,70 @@
4825+/**
4826+ * Copyright 2022 Huawei Technologies Co., Ltd
4827+ *
4828+ * Licensed under the Apache License, Version 2.0 (the "License");
4829+ * you may not use this file except in compliance with the License.
4830+ * You may obtain a copy of the License at
4831+ *
4832+ * http://www.apache.org/licenses/LICENSE-2.0
4833+ *
4834+ * Unless required by applicable law or agreed to in writing, software
4835+ * distributed under the License is distributed on an "AS IS" BASIS,
4836+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4837+ * See the License for the specific language governing permissions and
4838+ * limitations under the License.
4839+ */
4840+#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX512_H_
4841+#define MINDSPORE_NNACL_FP32_CDIST_AVX512_H_
4842+
4843+#include "nnacl/intrinsics/ms_simd_instructions.h"
4844+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4845+
4846+#ifdef __cplusplus
4847+extern "C" {
4848+#endif
4849+#pragma GCC push_options
4850+#pragma GCC target("avx512f")
4851+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4852+#define BLOCK_NUM 16
4853+#define MS_SIMD_AVX512
4854+
4855+static inline int64_t CdistTwoNormalOptAVX512(int64_t index, const float *a, const float *b,
4856+                                                          float *out, int64_t size) {
4857+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
4858+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4859+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
4860+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
4861+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
4862+    tmp_vec = SIMD_ABS_F32(tmp_vec);
4863+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
4864+  }
4865+  *out += SIMD_GET_SUM_F32(result_vec);
4866+
4867+  return index;
4868+}
4869+
4870+static inline int64_t CdistPNormalOptAVX512(int64_t index, const float *a, const float *b,
4871+                                                        float *out, int64_t size, float p) {
4872+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
4873+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
4874+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4875+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
4876+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
4877+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
4878+    tmp_vec = SIMD_ABS_F32(tmp_vec);
4879+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
4880+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
4881+  }
4882+  *out += SIMD_GET_SUM_F32(result_vec);
4883+
4884+  return index;
4885+}
4886+
4887+#undef MS_SIMD_INSTRUCTION
4888+#undef BLOCK_NUM
4889+#pragma GCC pop_options
4890+#undef MS_SIMD_AVX512
4891+#ifdef __cplusplus
4892+}
4893+#endif
4894+#endif
4895diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
4896new file mode 100644
4897index 00000000..f82adabf
4898--- /dev/null
4899+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
4900@@ -0,0 +1,121 @@
4901+/**
4902+ * Copyright 2022 Huawei Technologies Co., Ltd
4903+ *
4904+ * Licensed under the Apache License, Version 2.0 (the "License");
4905+ * you may not use this file except in compliance with the License.
4906+ * You may obtain a copy of the License at
4907+ *
4908+ * http://www.apache.org/licenses/LICENSE-2.0
4909+ *
4910+ * Unless required by applicable law or agreed to in writing, software
4911+ * distributed under the License is distributed on an "AS IS" BASIS,
4912+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4913+ * See the License for the specific language governing permissions and
4914+ * limitations under the License.
4915+ */
4916+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_
4917+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_
4918+
4919+#include "nnacl/intrinsics/ms_simd_instructions.h"
4920+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4921+
4922+#ifdef __cplusplus
4923+extern "C" {
4924+#endif
4925+#pragma GCC push_options
4926+#pragma GCC target("avx512f")
4927+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4928+#define BLOCK_NUM 16
4929+#define MS_SIMD_AVX512
4930+
4931+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
4932+// (a, b, c) -> (0, a,   a+b)    exclusive == true
4933+static inline int64_t CumsumOutputInitWithInputAVX512(int64_t index, const float *layer_input,
4934+  float *layer_output, int inner_dim) {
4935+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4936+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
4937+  }
4938+  return index;
4939+}
4940+
4941+static inline int64_t CumsumOutputInitWithZeroAVX512(int64_t index, float *layer_output, int inner_dim) {
4942+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4943+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
4944+  }
4945+  return index;
4946+}
4947+
4948+static inline int64_t CumsumAVX512(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
4949+  int inner_dim) {
4950+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4951+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
4952+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
4953+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
4954+    SIMD_ST_F32(layer_output + index, out_val);
4955+  }
4956+  return index;
4957+}
4958+
4959+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
4960+// (a, b, c) -> (c+b, c, 0) exclusive==true
4961+static inline int64_t CumsumReverseAVX512(int64_t index, const float *layer_input, float *layer_output,
4962+  float *layer_last_output, int inner_dim) {
4963+
4964+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4965+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
4966+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
4967+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
4968+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
4969+  }
4970+  return index;
4971+}
4972+
4973+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
4974+// (a, b, c) -> (0, a,   a+b)    exclusive == true
4975+static inline int64_t CumsumIntOutputInitWithInputAVX512(int64_t index, const int *layer_input,
4976+  int *layer_output, int inner_dim) {
4977+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4978+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
4979+  }
4980+  return index;
4981+}
4982+
4983+static inline int64_t CumsumIntOutputInitWithZeroAVX512(int64_t index, int *layer_output, int inner_dim) {
4984+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4985+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
4986+  }
4987+  return index;
4988+}
4989+
4990+static inline int64_t CumsumIntAVX512(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
4991+  int inner_dim) {
4992+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4993+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
4994+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
4995+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
4996+    SIMD_ST_EPI32(layer_output + index, out_val);
4997+  }
4998+  return index;
4999+}
5000+
5001+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
5002+// (a, b, c) -> (c+b, c, 0) exclusive==true
5003+static inline int64_t CumsumReverseIntAVX512(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
5004+  int inner_dim) {
5005+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5006+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
5007+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
5008+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
5009+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
5010+  }
5011+  return index;
5012+}
5013+
5014+#undef MS_SIMD_INSTRUCTION
5015+#undef BLOCK_NUM
5016+#pragma GCC pop_options
5017+#undef MS_SIMD_AVX512
5018+#ifdef __cplusplus
5019+}
5020+#endif
5021+#endif
5022diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
5023new file mode 100644
5024index 00000000..4de588fb
5025--- /dev/null
5026+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
5027@@ -0,0 +1,167 @@
5028+/**
5029+ * Copyright 2022 Huawei Technologies Co., Ltd
5030+ *
5031+ * Licensed under the Apache License, Version 2.0 (the "License");
5032+ * you may not use this file except in compliance with the License.
5033+ * You may obtain a copy of the License at
5034+ *
5035+ * http://www.apache.org/licenses/LICENSE-2.0
5036+ *
5037+ * Unless required by applicable law or agreed to in writing, software
5038+ * distributed under the License is distributed on an "AS IS" BASIS,
5039+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5040+ * See the License for the specific language governing permissions and
5041+ * limitations under the License.
5042+ */
5043+
5044+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX512_H_
5045+#define MINDSPORE_NNACL_FP32_DIV_AVX512_H_
5046+
5047+#include "nnacl/intrinsics/ms_simd_instructions.h"
5048+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5049+
5050+#ifdef __cplusplus
5051+extern "C" {
5052+#endif
5053+#pragma GCC push_options
5054+#pragma GCC target("avx512f")
5055+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5056+#define BLOCK_NUM 16
5057+#define MS_SIMD_AVX512
5058+
5059+static inline int ElementOptDivNum0AVX512(int index, const float *in0, const float *in1, float *out,
5060+                                                      int size) {
5061+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
5062+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5063+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5064+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
5065+    SIMD_ST_F32(out + index, vout);
5066+  }
5067+  return index;
5068+}
5069+
5070+static inline int ElementOptDivNum1AVX512(int index, const float *in0, const float *in1, float *out,
5071+                                                      int size) {
5072+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5073+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5074+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5075+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
5076+    SIMD_ST_F32(out + index, vout);
5077+  }
5078+  return index;
5079+}
5080+
5081+static inline int ElementOptDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5082+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
5083+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5084+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5085+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
5086+    SIMD_ST_EPI32(out + index, vout);
5087+  }
5088+  return index;
5089+}
5090+
5091+static inline int ElementOptDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5092+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5093+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5094+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5095+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
5096+    SIMD_ST_EPI32(out + index, vout);
5097+  }
5098+  return index;
5099+}
5100+
5101+static inline int ElementOptDivReluNum0AVX512(int index, const float *in0, const float *in1, float *out,
5102+                                                          int size) {
5103+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
5104+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5105+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5106+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
5107+    SIMD_ST_F32(out + index, vout);
5108+  }
5109+  return index;
5110+}
5111+
5112+static inline int ElementOptDivReluNum1AVX512(int index, const float *in0, const float *in1, float *out,
5113+                                                          int size) {
5114+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5115+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5116+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5117+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
5118+    SIMD_ST_F32(out + index, vout);
5119+  }
5120+  return index;
5121+}
5122+
5123+static inline int ElementOptDivRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out,
5124+                                                           int size) {
5125+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
5126+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5127+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5128+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
5129+    SIMD_ST_F32(out + index, vout);
5130+  }
5131+  return index;
5132+}
5133+
5134+static inline int ElementOptDivRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out,
5135+                                                           int size) {
5136+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5137+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5138+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5139+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
5140+    SIMD_ST_F32(out + index, vout);
5141+  }
5142+  return index;
5143+}
5144+
5145+static inline int ElementDivAVX512(int index, const float *in0, const float *in1, float *out, int size) {
5146+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5147+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5148+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5149+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
5150+    SIMD_ST_F32(out + index, vout);
5151+  }
5152+  return index;
5153+}
5154+
5155+static inline int ElementDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5156+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5157+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5158+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5159+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
5160+    SIMD_ST_EPI32(out + index, vout);
5161+  }
5162+  return index;
5163+}
5164+
5165+static inline int ElementDivReluAVX512(int index, const float *in0, const float *in1, float *out,
5166+                                                   int size) {
5167+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5168+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5169+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5170+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
5171+    SIMD_ST_F32(out + index, vout);
5172+  }
5173+  return index;
5174+}
5175+
5176+static inline int ElementDivRelu6AVX512(int index, const float *in0, const float *in1, float *out,
5177+                                                    int size) {
5178+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5179+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5180+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5181+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
5182+    SIMD_ST_F32(out + index, vout);
5183+  }
5184+  return index;
5185+}
5186+
5187+#undef MS_SIMD_INSTRUCTION
5188+#undef BLOCK_NUM
5189+#pragma GCC pop_options
5190+#undef MS_SIMD_AVX512
5191+#ifdef __cplusplus
5192+}
5193+#endif
5194+#endif
5195diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
5196new file mode 100644
5197index 00000000..eb847c23
5198--- /dev/null
5199+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
5200@@ -0,0 +1,46 @@
5201+/**
5202+ * Copyright 2022 Huawei Technologies Co., Ltd
5203+ *
5204+ * Licensed under the Apache License, Version 2.0 (the "License");
5205+ * you may not use this file except in compliance with the License.
5206+ * You may obtain a copy of the License at
5207+ *
5208+ * http://www.apache.org/licenses/LICENSE-2.0
5209+ *
5210+ * Unless required by applicable law or agreed to in writing, software
5211+ * distributed under the License is distributed on an "AS IS" BASIS,
5212+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5213+ * See the License for the specific language governing permissions and
5214+ * limitations under the License.
5215+ */
5216+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_
5217+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_
5218+
5219+#include "nnacl/intrinsics/ms_simd_instructions.h"
5220+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5221+
5222+#ifdef __cplusplus
5223+extern "C" {
5224+#endif
5225+#pragma GCC push_options
5226+#pragma GCC target("avx512f")
5227+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5228+#define BLOCK_NUM 16
5229+#define MS_SIMD_AVX512
5230+
5231+static inline int DropoutFp32AVX512(int index, const float *input, float scale,
5232+    int length, float *output) {
5233+    SIMD_F32 scale_value = SIMD_MOV_F32(scale);
5234+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5235+        SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
5236+    }
5237+    return index;
5238+}
5239+#undef MS_SIMD_INSTRUCTION
5240+#undef BLOCK_NUM
5241+#pragma GCC pop_options
5242+#undef MS_SIMD_AVX512
5243+#ifdef __cplusplus
5244+}
5245+#endif
5246+#endif
5247diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
5248new file mode 100644
5249index 00000000..14386f5f
5250--- /dev/null
5251+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
5252@@ -0,0 +1,63 @@
5253+/**
5254+ * Copyright 2022 Huawei Technologies Co., Ltd
5255+ *
5256+ * Licensed under the Apache License, Version 2.0 (the "License");
5257+ * you may not use this file except in compliance with the License.
5258+ * You may obtain a copy of the License at
5259+ *
5260+ * http://www.apache.org/licenses/LICENSE-2.0
5261+ *
5262+ * Unless required by applicable law or agreed to in writing, software
5263+ * distributed under the License is distributed on an "AS IS" BASIS,
5264+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5265+ * See the License for the specific language governing permissions and
5266+ * limitations under the License.
5267+ */
5268+
5269+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX512_H_
5270+#define MINDSPORE_NNACL_FP32_EXP_AVX512_H_
5271+
5272+#include "nnacl/intrinsics/ms_simd_instructions.h"
5273+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5274+
5275+#ifdef __cplusplus
5276+extern "C" {
5277+#endif
5278+#pragma GCC push_options
5279+#pragma GCC target("avx512f")
5280+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5281+#define BLOCK_NUM 16
5282+#define MS_SIMD_AVX512
5283+
5284+static inline int64_t ExpFp32AVX512(int64_t index, const float *src, float *dst, int num) {
5285+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5286+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
5287+  }
5288+  return index;
5289+}
5290+
5291+static inline int64_t ExpFp32WithInScaleAVX512(int64_t index, const float *src, float *dst, int num, float in_scale) {
5292+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
5293+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5294+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
5295+  }
5296+  return index;
5297+}
5298+
5299+static inline int64_t ExpFp32WithOutScaleAVX512(int64_t index, const float *src, float *dst, int num, float out_scale) {
5300+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
5301+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5302+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
5303+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
5304+  }
5305+  return index;
5306+}
5307+
5308+#undef MS_SIMD_INSTRUCTION
5309+#undef BLOCK_NUM
5310+#pragma GCC pop_options
5311+#undef MS_SIMD_AVX512
5312+#ifdef __cplusplus
5313+}
5314+#endif
5315+#endif
5316diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
5317new file mode 100644
5318index 00000000..5eb04746
5319--- /dev/null
5320+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
5321@@ -0,0 +1,53 @@
5322+/**
5323+ * Copyright 2022 Huawei Technologies Co., Ltd
5324+ *
5325+ * Licensed under the Apache License, Version 2.0 (the "License");
5326+ * you may not use this file except in compliance with the License.
5327+ * You may obtain a copy of the License at
5328+ *
5329+ * http://www.apache.org/licenses/LICENSE-2.0
5330+ *
5331+ * Unless required by applicable law or agreed to in writing, software
5332+ * distributed under the License is distributed on an "AS IS" BASIS,
5333+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5334+ * See the License for the specific language governing permissions and
5335+ * limitations under the License.
5336+ */
5337+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_
5338+#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_
5339+
5340+#include "nnacl/intrinsics/ms_simd_instructions.h"
5341+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5342+
5343+#ifdef __cplusplus
5344+extern "C" {
5345+#endif
5346+#pragma GCC push_options
5347+#pragma GCC target("avx512f")
5348+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5349+#define BLOCK_NUM 16
5350+#define MS_SIMD_AVX512
5351+
5352+static inline int FillFp32AVX512(int index, float *output, int size, float data) {
5353+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5354+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
5355+  }
5356+  return index;
5357+}
5358+
5359+static inline int FillInt32AVX512(int index, int *output, int size, int data) {
5360+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5361+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
5362+  }
5363+  return index;
5364+}
5365+
5366+#undef MS_SIMD_INSTRUCTION
5367+#undef BLOCK_NUM
5368+#pragma GCC pop_options
5369+#undef MS_SIMD_AVX512
5370+#ifdef __cplusplus
5371+}
5372+#endif
5373+#endif
5374+
5375diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
5376new file mode 100644
5377index 00000000..f26537d9
5378--- /dev/null
5379+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
5380@@ -0,0 +1,77 @@
5381+/**
5382+ * Copyright 2022 Huawei Technologies Co., Ltd
5383+ *
5384+ * Licensed under the Apache License, Version 2.0 (the "License");
5385+ * you may not use this file except in compliance with the License.
5386+ * You may obtain a copy of the License at
5387+ *
5388+ * http://www.apache.org/licenses/LICENSE-2.0
5389+ *
5390+ * Unless required by applicable law or agreed to in writing, software
5391+ * distributed under the License is distributed on an "AS IS" BASIS,
5392+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5393+ * See the License for the specific language governing permissions and
5394+ * limitations under the License.
5395+ */
5396+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_
5397+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_
5398+
5399+#include "nnacl/intrinsics/ms_simd_instructions.h"
5400+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5401+
5402+#ifdef __cplusplus
5403+extern "C" {
5404+#endif
5405+#pragma GCC push_options
5406+#pragma GCC target("avx512f")
5407+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5408+#define BLOCK_NUM 16
5409+#define MS_SIMD_AVX512
5410+
5411+static inline int64_t GroupNormFp32AVX512(int64_t index, const float *unit_input, float scale, float offset, float mean,
5412+  float var_sqrt, int unit, float *unit_output) {
5413+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
5414+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
5415+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
5416+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
5417+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5418+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
5419+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
5420+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
5421+    SIMD_ST_F32(unit_output + index, output);
5422+  }
5423+  return index;
5424+}
5425+
5426+static inline int64_t GroupNormReduceSumAVX512(int64_t index, const float *in, float *sum, int unit) {
5427+  if (unit - index >= 4 * BLOCK_NUM) {
5428+    SIMD_F32 tmp = SIMD_MOV_F32(0);
5429+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5430+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
5431+    }
5432+    *sum += SIMD_GET_SUM_F32(tmp);
5433+  }
5434+  return index;
5435+}
5436+
5437+static inline int64_t GroupNormReduceVarAVX512(int64_t index, const float *in, float mean, float *sum, int unit) {
5438+  if (unit - index >= 4 * BLOCK_NUM) {
5439+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
5440+    SIMD_F32 tmp = SIMD_MOV_F32(0);
5441+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5442+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
5443+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
5444+    }
5445+    *sum += SIMD_GET_SUM_F32(tmp);
5446+  }
5447+  return index;
5448+}
5449+
5450+#undef MS_SIMD_INSTRUCTION
5451+#undef BLOCK_NUM
5452+#pragma GCC pop_options
5453+#undef MS_SIMD_AVX512
5454+#ifdef __cplusplus
5455+}
5456+#endif
5457+#endif
5458diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
5459new file mode 100644
5460index 00000000..e5fb6d7b
5461--- /dev/null
5462+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
5463@@ -0,0 +1,68 @@
5464+/**
5465+ * Copyright 2022 Huawei Technologies Co., Ltd
5466+ *
5467+ * Licensed under the Apache License, Version 2.0 (the "License");
5468+ * you may not use this file except in compliance with the License.
5469+ * You may obtain a copy of the License at
5470+ *
5471+ * http://www.apache.org/licenses/LICENSE-2.0
5472+ *
5473+ * Unless required by applicable law or agreed to in writing, software
5474+ * distributed under the License is distributed on an "AS IS" BASIS,
5475+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5476+ * See the License for the specific language governing permissions and
5477+ * limitations under the License.
5478+ */
5479+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_
5480+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_
5481+
5482+#include "nnacl/intrinsics/ms_simd_instructions.h"
5483+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5484+
5485+#ifdef __cplusplus
5486+extern "C" {
5487+#endif
5488+#pragma GCC push_options
5489+#pragma GCC target("avx512f")
5490+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5491+#define BLOCK_NUM 16
5492+#define MS_SIMD_AVX512
5493+
5494+static inline int LayerNormMeanAndSquareAVX512(int index, const float *src, int num, float *mean, float *square_mean) {
5495+  if (num >= 4 * BLOCK_NUM) {
5496+    SIMD_F32 sum_val = SIMD_SET0_F32;
5497+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
5498+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5499+      SIMD_F32 value = SIMD_LD_F32(src + index);
5500+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
5501+      sum_val = SIMD_ADD_F32(sum_val, value);
5502+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
5503+    }
5504+    *mean += SIMD_GET_SUM_F32(sum_val);
5505+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
5506+  }
5507+  return index;
5508+}
5509+
5510+static inline int LayerNormGammaAndBetaAVX512(int index, float *dst, const float *src, const float *gamma_data,
5511+  const float *beta_data, int num, const float mean, const float deno) {
5512+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
5513+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
5514+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5515+    SIMD_F32 value = SIMD_LD_F32(src + index);
5516+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
5517+    out_value = SIMD_MUL_F32(out_value, deno_val);
5518+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
5519+    SIMD_ST_F32(dst + index, out_value);
5520+  }
5521+  return index;
5522+}
5523+
5524+#undef MS_SIMD_INSTRUCTION
5525+#undef BLOCK_NUM
5526+#pragma GCC pop_options
5527+#undef MS_SIMD_AVX512
5528+#ifdef __cplusplus
5529+}
5530+#endif
5531+#endif
5532diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
5533new file mode 100644
5534index 00000000..d51779d4
5535--- /dev/null
5536+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
5537@@ -0,0 +1,93 @@
5538+/**
5539+ * Copyright 2022 Huawei Technologies Co., Ltd
5540+ *
5541+ * Licensed under the Apache License, Version 2.0 (the "License");
5542+ * you may not use this file except in compliance with the License.
5543+ * You may obtain a copy of the License at
5544+ *
5545+ * http://www.apache.org/licenses/LICENSE-2.0
5546+ *
5547+ * Unless required by applicable law or agreed to in writing, software
5548+ * distributed under the License is distributed on an "AS IS" BASIS,
5549+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5550+ * See the License for the specific language governing permissions and
5551+ * limitations under the License.
5552+ */
5553+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_
5554+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_
5555+
5556+#include "nnacl/intrinsics/ms_simd_instructions.h"
5557+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5558+
5559+#ifdef __cplusplus
5560+extern "C" {
5561+#endif
5562+#pragma GCC push_options
5563+#pragma GCC target("avx512f")
5564+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5565+#define BLOCK_NUM 16
5566+#define MS_SIMD_AVX512
5567+
5568+// act_type must be 0, 1 or 3. 0: no_act, 1: relu, 3: relu6.
5569+static inline int64_t GemmIsNotPackAVX512(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
5570+  int deep, int act_type) {
5571+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
5572+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
5573+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
5574+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);  // NOTE(review): bias dereferenced unconditionally, but MatVecMulNoPackCoreAVX512 below guards bias == NULL — confirm callers never pass NULL here
5575+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5576+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
5577+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
5578+    if (act_type != 0) {
5579+      dst = SIMD_MAX_F32(dst, down_threshold);
5580+      if (act_type == 3) {
5581+        dst = SIMD_MIN_F32(dst, up_threshold);
5582+      }
5583+    }
5584+    SIMD_ST_F32(c + index, dst);
5585+  }
5586+
5587+  return index;
5588+}
5589+
5590+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)  // NOTE(review): always true — MS_SIMD_AVX512 is defined unconditionally above; guard is redundant
5591+static inline int64_t GemmIsNotPackOptimizeCoreAVX512(int64_t index, const float *a, const float *b, int k, float *dst) {
5592+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
5593+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5594+    SIMD_F32 weight = SIMD_LD_F32(b + index);
5595+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
5596+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
5597+  }
5598+  *dst += SIMD_REDUCE_ADD_F32(dst1);
5599+  return index;
5600+}
5601+#endif
5602+
5603+static inline int64_t MatVecMulNoPackCoreAVX512(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
5604+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
5605+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
5606+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
5607+    for (int64_t k = 0; k < depth; ++k) {
5608+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
5609+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
5610+      out = SIMD_FMADD_F32(left, right, out);
5611+    }
5612+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
5613+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
5614+      if (act_type == 0x3) {
5615+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
5616+      }
5617+    }
5618+    SIMD_ST_F32(c + oc_index, out);
5619+  }
5620+  return oc_index;
5621+}
5622+
5623+#undef MS_SIMD_INSTRUCTION
5624+#undef BLOCK_NUM
5625+#pragma GCC pop_options
5626+#undef MS_SIMD_AVX512
5627+#ifdef __cplusplus
5628+}
5629+#endif
5630+#endif
5631diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
5632new file mode 100644
5633index 00000000..e3b242e4
5634--- /dev/null
5635+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
5636@@ -0,0 +1,218 @@
5637+/**
5638+ * Copyright 2022 Huawei Technologies Co., Ltd
5639+ *
5640+ * Licensed under the Apache License, Version 2.0 (the "License");
5641+ * you may not use this file except in compliance with the License.
5642+ * You may obtain a copy of the License at
5643+ *
5644+ * http://www.apache.org/licenses/LICENSE-2.0
5645+ *
5646+ * Unless required by applicable law or agreed to in writing, software
5647+ * distributed under the License is distributed on an "AS IS" BASIS,
5648+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5649+ * See the License for the specific language governing permissions and
5650+ * limitations under the License.
5651+ */
5652+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX512_H_
5653+#define MINDSPORE_NNACL_FP32_MUL_AVX512_H_
5654+
5655+#include "nnacl/intrinsics/ms_simd_instructions.h"
5656+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5657+
5658+#ifdef __cplusplus
5659+extern "C" {
5660+#endif
5661+#pragma GCC push_options
5662+#pragma GCC target("avx512f")
5663+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5664+#define BLOCK_NUM 16
5665+#define MS_SIMD_AVX512
5666+
5667+static inline int ElementMulAVX512(int index, const float *in0, const float *in1, float *out, int size) {
5668+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5669+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5670+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5671+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
5672+    SIMD_ST_F32(out + index, vout);
5673+  }
5674+  return index;
5675+}
5676+
5677+static inline int ElementMulReluAVX512(int index, const float *in0, const float *in1, float *out, int size) {
5678+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5679+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5680+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5681+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
5682+    SIMD_ST_F32(out + index, vout);
5683+  }
5684+  return index;
5685+}
5686+
5687+static inline int ElementMulRelu6AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5688+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5689+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5690+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5691+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
5692+    SIMD_ST_F32(out + index, vout);
5693+  }
5694+  return index;
5695+}
5696+
5697+static inline int ElementMulIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5698+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5699+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5700+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5701+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
5702+    SIMD_ST_EPI32(out + index, vout);
5703+  }
5704+  return index;
5705+}
5706+
5707+static inline int ElementMulReluIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5708+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5709+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5710+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5711+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);  // NOTE(review): float literal 0.0f passed to integer max; integer 0 is likely intended
5712+    SIMD_ST_EPI32(out + index, vout);
5713+  }
5714+  return index;
5715+}
5716+
5717+static inline int ElementMulRelu6IntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5718+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5719+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5720+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5721+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);  // NOTE(review): integer clamp written with float literals 0.0f/6.0f; integer 0 and 6 are likely intended
5722+    SIMD_ST_EPI32(out + index, vout);
5723+  }
5724+  return index;
5725+}
5726+
5727+static inline int ElementOptMulNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5728+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
5729+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5730+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5731+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
5732+    SIMD_ST_F32(out + index, vout);
5733+  }
5734+  return index;
5735+}
5736+
5737+static inline int ElementOptMulNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5738+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5739+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5740+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5741+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
5742+    SIMD_ST_F32(out + index, vout);
5743+  }
5744+  return index;
5745+}
5746+
5747+static inline int ElementOptMulReluNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5748+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
5749+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5750+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5751+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
5752+    SIMD_ST_F32(out + index, vout);
5753+  }
5754+  return index;
5755+}
5756+
5757+static inline int ElementOptMulReluNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5758+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5759+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5760+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5761+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
5762+    SIMD_ST_F32(out + index, vout);
5763+  }
5764+  return index;
5765+}
5766+
5767+static inline int ElementOptMulRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5768+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
5769+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5770+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5771+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
5772+    SIMD_ST_F32(out + index, vout);
5773+  }
5774+  return index;
5775+}
5776+
5777+static inline int ElementOptMulRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5778+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5779+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5780+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5781+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
5782+    SIMD_ST_F32(out + index, vout);
5783+  }
5784+  return index;
5785+}
5786+
5787+static inline int ElementOptMulIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5788+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
5789+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5790+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5791+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
5792+    SIMD_ST_EPI32(out + index, vout);
5793+  }
5794+  return index;
5795+}
5796+
5797+static inline int ElementOptMulIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5798+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5799+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5800+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5801+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
5802+    SIMD_ST_EPI32(out + index, vout);
5803+  }
5804+  return index;
5805+}
5806+
5807+static inline int ElementOptMulReluIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5808+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
5809+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5810+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5811+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
5812+    SIMD_ST_EPI32(out + index, vout);
5813+  }
5814+  return index;
5815+}
5816+
5817+static inline int ElementOptMulReluIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5818+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5819+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5820+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5821+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
5822+    SIMD_ST_EPI32(out + index, vout);
5823+  }
5824+  return index;
5825+}
5826+
5827+static inline int ElementOptMulRelu6IntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5828+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
5829+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5830+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5831+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
5832+    SIMD_ST_EPI32(out + index, vout);
5833+  }
5834+  return index;
5835+}
5836+
5837+static inline int ElementOptMulRelu6IntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5838+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5839+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5840+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5841+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
5842+    SIMD_ST_EPI32(out + index, vout);
5843+  }
5844+  return index;
5845+}
5846+
5847+#undef MS_SIMD_INSTRUCTION
5848+#undef BLOCK_NUM
5849+#pragma GCC pop_options
5850+#undef MS_SIMD_AVX512
5851+#ifdef __cplusplus
5852+}
5853+#endif
5854+#endif
5855diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
5856new file mode 100644
5857index 00000000..d1e001ee
5858--- /dev/null
5859+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
5860@@ -0,0 +1,84 @@
5861+/**
5862+ * Copyright 2022 Huawei Technologies Co., Ltd
5863+ *
5864+ * Licensed under the Apache License, Version 2.0 (the "License");
5865+ * you may not use this file except in compliance with the License.
5866+ * You may obtain a copy of the License at
5867+ *
5868+ * http://www.apache.org/licenses/LICENSE-2.0
5869+ *
5870+ * Unless required by applicable law or agreed to in writing, software
5871+ * distributed under the License is distributed on an "AS IS" BASIS,
5872+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5873+ * See the License for the specific language governing permissions and
5874+ * limitations under the License.
5875+ */
5876+#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX512_H_
5877+#define MINDSPORE_NNACL_FP32_POOLING_AVX512_H_
5878+
5879+#include "nnacl/intrinsics/ms_simd_instructions.h"
5880+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5881+
5882+#ifdef __cplusplus
5883+extern "C" {
5884+#endif
5885+#pragma GCC push_options
5886+#pragma GCC target("avx512f")
5887+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5888+#define BLOCK_NUM 16
5889+#define MS_SIMD_AVX512
5890+
5891+static inline int AvgPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel,
5892+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
5893+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
5894+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
5895+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
5896+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
5897+    const float *src_c_ptr = src_plane_ptr + ci;
5898+    float *dst_c_ptr = dst_plane_ptr + ci;
5899+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
5900+    int real_count = 0;
5901+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
5902+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
5903+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
5904+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
5905+        ++real_count;
5906+      }
5907+    }
5908+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
5909+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
5910+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
5911+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
5912+  }
5913+  return ci;
5914+}
5915+
5916+static inline int MaxPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel,
5917+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
5918+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
5919+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
5920+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
5921+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
5922+    const float *src_c_ptr = src_plane_ptr + ci;
5923+    float *dst_c_ptr = dst_plane_ptr + ci;
5924+    SIMD_F32 tmp_max = min_val;
5925+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
5926+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
5927+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
5928+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
5929+      }
5930+    }
5931+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
5932+    SIMD_ST_F32(dst_c_ptr, tmp_max);
5933+  }
5934+  return ci;
5935+}
5936+
5937+#undef MS_SIMD_INSTRUCTION
5938+#undef BLOCK_NUM
5939+#pragma GCC pop_options
5940+#undef MS_SIMD_AVX512
5941+#ifdef __cplusplus
5942+}
5943+#endif
5944+#endif
5945diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
5946new file mode 100644
5947index 00000000..a31eaf2f
5948--- /dev/null
5949+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
5950@@ -0,0 +1,101 @@
5951+/**
5952+ * Copyright 2022 Huawei Technologies Co., Ltd
5953+ *
5954+ * Licensed under the Apache License, Version 2.0 (the "License");
5955+ * you may not use this file except in compliance with the License.
5956+ * You may obtain a copy of the License at
5957+ *
5958+ * http://www.apache.org/licenses/LICENSE-2.0
5959+ *
5960+ * Unless required by applicable law or agreed to in writing, software
5961+ * distributed under the License is distributed on an "AS IS" BASIS,
5962+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5963+ * See the License for the specific language governing permissions and
5964+ * limitations under the License.
5965+ */
5966+#ifndef MINDSPORE_NNACL_FP32_POWER_AVX512_H_
5967+#define MINDSPORE_NNACL_FP32_POWER_AVX512_H_
5968+
5969+#include "nnacl/intrinsics/ms_simd_instructions.h"
5970+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5971+
5972+#ifdef __cplusplus
5973+extern "C" {
5974+#endif
5975+#pragma GCC push_options
5976+#pragma GCC target("avx512f")
5977+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5978+#define BLOCK_NUM 16
5979+#define MS_SIMD_AVX512
5980+
5981+static inline int PowerBroadCastIntExponentAVX512(int index, const float *input, int exponent, float *output, int len,
5982+  float scale, float shift) {
5983+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
5984+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
5985+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5986+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
5987+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
5988+    int exp = abs(exponent);
5989+    while (exp) {
5990+      if (exp % 2) {
5991+        result = SIMD_MUL_F32(result, tmp);
5992+      }
5993+      tmp = SIMD_MUL_SQUARE_F32(tmp);
5994+      exp = exp / 2;
5995+    }
5996+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
5997+  }
5998+  return index;
5999+}
6000+
6001+static inline int PowerBroadCastFloatExponentAVX512(int index, const float *input, float exponent, float *output, int len,
6002+  float scale, float shift) {
6003+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
6004+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
6005+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6006+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
6007+    SIMD_F32 result;
6008+    for (int i = 0; i < BLOCK_NUM; ++i) {
6009+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
6010+    }
6011+    SIMD_ST_F32(output + index, result);
6012+  }
6013+  return index;
6014+}
6015+
6016+static inline int PowerSingleExponentAVX512(int index, const float *input, const float *exponent, float *output, int len,
6017+  float scale, float shift) {
6018+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
6019+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
6020+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6021+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
6022+    for (int j = 0; j < BLOCK_NUM; ++j) {
6023+      float cur_exponent = exponent[index + j];
6024+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
6025+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
6026+        int exp = abs((int)(cur_exponent));
6027+        float result = 1;
6028+        while (exp) {
6029+          if (exp % 2) {
6030+            result *= cur_val;
6031+          }
6032+          cur_val *= cur_val;
6033+          exp = exp / 2;
6034+        }
6035+        output[index + j] = *exponent >= 0 ? result : 1 / result;  // NOTE(review): *exponent tests only exponent[0]; cur_exponent >= 0 looks intended since exponents are per-element
6036+      } else {
6037+        output[index + j] = powf(cur_val, cur_exponent);
6038+      }
6039+    }
6040+  }
6041+  return index;
6042+}
6043+
6044+#undef MS_SIMD_INSTRUCTION
6045+#undef BLOCK_NUM
6046+#pragma GCC pop_options
6047+#undef MS_SIMD_AVX512
6048+#ifdef __cplusplus
6049+}
6050+#endif
6051+#endif
6052diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
6053new file mode 100644
6054index 00000000..5885a044
6055--- /dev/null
6056+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
6057@@ -0,0 +1,181 @@
6058+/**
6059+ * Copyright 2022 Huawei Technologies Co., Ltd
6060+ *
6061+ * Licensed under the Apache License, Version 2.0 (the "License");
6062+ * you may not use this file except in compliance with the License.
6063+ * You may obtain a copy of the License at
6064+ *
6065+ * http://www.apache.org/licenses/LICENSE-2.0
6066+ *
6067+ * Unless required by applicable law or agreed to in writing, software
6068+ * distributed under the License is distributed on an "AS IS" BASIS,
6069+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6070+ * See the License for the specific language governing permissions and
6071+ * limitations under the License.
6072+ */
6073+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_
6074+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_
6075+
6076+#include "nnacl/intrinsics/ms_simd_instructions.h"
6077+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
6078+
6079+#ifdef __cplusplus
6080+extern "C" {
6081+#endif
6082+#pragma GCC push_options
6083+#pragma GCC target("avx512f")
6084+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
6085+#define BLOCK_NUM 16
6086+#define MS_SIMD_AVX512
6087+
6088+static inline int64_t ReduceSumAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6089+  int axis_size) {
6090+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6091+    const float *inner_src = outer_src + index;
6092+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6093+    for (int i = 0; i < axis_size; i++) {
6094+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6095+    }
6096+    SIMD_ST_F32(outer_dst + index, tmp);
6097+  }
6098+  return index;
6099+}
6100+
6101+static inline int64_t ReduceMeanAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6102+  int axis_size) {
6103+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6104+    const float *inner_src = outer_src + index;
6105+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6106+    for (int i = 0; i < axis_size; i++) {
6107+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6108+    }
6109+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
6110+  }
6111+  return index;
6112+}
6113+
6114+static inline int64_t ReduceMinAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6115+  int axis_size) {
6116+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6117+    const float *inner_src = outer_src + index;
6118+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
6119+    for (int i = 0; i < axis_size; i++) {
6120+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6121+    }
6122+    SIMD_ST_F32(outer_dst + index, tmp);
6123+  }
6124+  return index;
6125+}
6126+
6127+static inline int64_t ReduceMaxAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6128+  int axis_size) {
6129+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6130+    const float *inner_src = outer_src + index;
6131+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MIN);  // NOTE(review): FLT_MIN is the smallest POSITIVE float — wrong seed for a max-reduction over negative inputs; -FLT_MAX is likely intended (cf. ReduceMin using FLT_MAX)
6132+    for (int i = 0; i < axis_size; i++) {
6133+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6134+    }
6135+    SIMD_ST_F32(outer_dst + index, tmp);
6136+  }
6137+  return index;
6138+}
6139+
6140+static inline int64_t ReduceProdAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6141+  int axis_size) {
6142+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6143+    const float *inner_src = outer_src + index;
6144+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
6145+    for (int i = 0; i < axis_size; i++) {
6146+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6147+    }
6148+    SIMD_ST_F32(outer_dst + index, tmp);
6149+  }
6150+  return index;
6151+}
6152+
6153+static inline int64_t ReduceSumSquareAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6154+  int axis_size) {
6155+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6156+    const float *inner_src = outer_src + index;
6157+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6158+    for (int i = 0; i < axis_size; i++) {
6159+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
6160+    }
6161+    SIMD_ST_F32(outer_dst + index, tmp);
6162+  }
6163+  return index;
6164+}
6165+
6166+static inline int64_t ReduceL2NormAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6167+  int axis_size) {
6168+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6169+    const float *inner_src = outer_src + index;
6170+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6171+    for (int i = 0; i < axis_size; i++) {
6172+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
6173+    }
6174+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));
6175+  }
6176+  return index;
6177+}
6178+
6179+static inline int64_t IntReduceSumAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6180+  int axis_size) {
6181+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6182+    const int *inner_src = outer_src + index;
6183+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
6184+    for (int i = 0; i < axis_size; i++) {
6185+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6186+    }
6187+    SIMD_ST_EPI32(outer_dst + index, tmp);
6188+  }
6189+  return index;
6190+}
6191+
6192+static inline int64_t IntReduceMeanAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6193+  int axis_size) {
6194+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6195+    const int *inner_src = outer_src + index;
6196+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
6197+    for (int i = 0; i < axis_size; i++) {
6198+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6199+    }
6200+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));
6201+  }
6202+  return index;
6203+}
6204+
6205+static inline int64_t IntReduceMinAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6206+  int axis_size) {
6207+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6208+    const int *inner_src = outer_src + index;
6209+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);
6210+    for (int i = 0; i < axis_size; i++) {
6211+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6212+    }
6213+    SIMD_ST_EPI32(outer_dst + index, tmp);
6214+  }
6215+  return index;
6216+}
6217+
6218+static inline int64_t IntReduceMaxAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6219+  int axis_size) {
6220+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6221+    const int *inner_src = outer_src + index;
6222+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);
6223+    for (int i = 0; i < axis_size; i++) {
6224+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6225+    }
6226+    SIMD_ST_EPI32(outer_dst + index, tmp);
6227+  }
6228+  return index;
6229+}
6230+
6231+#undef MS_SIMD_INSTRUCTION
6232+#undef BLOCK_NUM
6233+#pragma GCC pop_options
6234+#undef MS_SIMD_AVX512
6235+#ifdef __cplusplus
6236+}
6237+#endif
6238+#endif
6239diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
6240new file mode 100644
6241index 00000000..1fa1907e
6242--- /dev/null
6243+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
6244@@ -0,0 +1,87 @@
6245+/**
6246+ * Copyright 2022 Huawei Technologies Co., Ltd
6247+ *
6248+ * Licensed under the Apache License, Version 2.0 (the "License");
6249+ * you may not use this file except in compliance with the License.
6250+ * You may obtain a copy of the License at
6251+ *
6252+ * http://www.apache.org/licenses/LICENSE-2.0
6253+ *
6254+ * Unless required by applicable law or agreed to in writing, software
6255+ * distributed under the License is distributed on an "AS IS" BASIS,
6256+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6257+ * See the License for the specific language governing permissions and
6258+ * limitations under the License.
6259+ */
6260+
6261+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_
6262+#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_
6263+
6264+#include "nnacl/intrinsics/ms_simd_instructions.h"
6265+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
6266+
6267+#ifdef __cplusplus
6268+extern "C" {
6269+#endif
6270+#pragma GCC push_options
6271+#pragma GCC target("avx512f")
6272+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
6273+#define BLOCK_NUM 16
6274+#define MS_SIMD_AVX512
6275+
6276+static inline int64_t SoftmaxNormGetMaxAVX512(int64_t index, const float *src, int cur_batch_offset,
6277+  float *max, int channel) {
6278+  if (channel >= BLOCK_NUM * BLOCK_NUM) {
6279+    SIMD_F32 max_val = SIMD_MOV_F32(*max);
6280+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6281+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
6282+    }
6283+    *max = SIMD_GET_MAX_F32(max_val);
6284+  }
6285+  return index;
6286+}
6287+
6288+static inline int64_t SoftmaxNormCalcNormAVX512(int64_t index, const float *src, float *dst,
6289+  int cur_batch_offset, float max, int channel) {
6290+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6291+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
6292+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
6293+  }
6294+  return index;
6295+}
6296+
6297+static inline int64_t SoftmaxLastAxisGetExpSumAVX512(int64_t index, const float *src, float *dst,
6298+  int cur_batch_offset, float max, float *exp_sum, int channel) {
6299+#ifndef _WIN32
6300+  SIMD_F32 sum_val = SIMD_SET0_F32;
6301+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6302+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
6303+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
6304+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
6305+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
6306+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
6307+  }
6308+  *exp_sum += SIMD_GET_SUM_F32(sum_val);
6309+#endif
6310+  return index;
6311+}
6312+
6313+static inline int64_t SoftmaxLastAxisGetResultAVX512(int64_t index, const float *src, float *dst,
6314+  int cur_batch_offset, float exp_sum, int channel) {
6315+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
6316+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6317+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
6318+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
6319+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
6320+  }
6321+  return index;
6322+}
6323+
6324+#undef MS_SIMD_INSTRUCTION
6325+#undef BLOCK_NUM
6326+#pragma GCC pop_options
6327+#undef MS_SIMD_AVX512
6328+#ifdef __cplusplus
6329+};
6330+#endif
6331+#endif
6332diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
6333new file mode 100644
6334index 00000000..994fc7c0
6335--- /dev/null
6336+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
6337@@ -0,0 +1,167 @@
6338+/**
6339+ * Copyright 2022 Huawei Technologies Co., Ltd
6340+ *
6341+ * Licensed under the Apache License, Version 2.0 (the "License");
6342+ * you may not use this file except in compliance with the License.
6343+ * You may obtain a copy of the License at
6344+ *
6345+ * http://www.apache.org/licenses/LICENSE-2.0
6346+ *
6347+ * Unless required by applicable law or agreed to in writing, software
6348+ * distributed under the License is distributed on an "AS IS" BASIS,
6349+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6350+ * See the License for the specific language governing permissions and
6351+ * limitations under the License.
6352+ */
6353+
6354+#ifndef MINDSPORE_NNACL_FP32_SUB_AVX512_H_
6355+#define MINDSPORE_NNACL_FP32_SUB_AVX512_H_
6356+
6357+#include "nnacl/intrinsics/ms_simd_instructions.h"
6358+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
6359+
6360+#ifdef __cplusplus
6361+extern "C" {
6362+#endif
6363+#pragma GCC push_options
6364+#pragma GCC target("avx512f")
6365+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
6366+#define BLOCK_NUM 16
6367+#define MS_SIMD_AVX512
6368+
6369+static inline int ElementOptSubNum0AVX512(int index, const float *in0, const float *in1, float *out,
6370+                                                      int size) {
6371+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
6372+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6373+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6374+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
6375+    SIMD_ST_F32(out + index, vout);
6376+  }
6377+  return index;
6378+}
6379+
6380+static inline int ElementOptSubNum1AVX512(int index, const float *in0, const float *in1, float *out,
6381+                                                      int size) {
6382+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
6383+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6384+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6385+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
6386+    SIMD_ST_F32(out + index, vout);
6387+  }
6388+  return index;
6389+}
6390+
6391+static inline int ElementOptSubIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
6392+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
6393+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6394+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
6395+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
6396+    SIMD_ST_EPI32(out + index, vout);
6397+  }
6398+  return index;
6399+}
6400+
6401+static inline int ElementOptSubIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
6402+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
6403+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6404+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
6405+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
6406+    SIMD_ST_EPI32(out + index, vout);
6407+  }
6408+  return index;
6409+}
6410+
6411+static inline int ElementOptSubReluNum0AVX512(int index, const float *in0, const float *in1, float *out,
6412+                                                          int size) {
6413+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
6414+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6415+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6416+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
6417+    SIMD_ST_F32(out + index, vout);
6418+  }
6419+  return index;
6420+}
6421+
6422+static inline int ElementOptSubReluNum1AVX512(int index, const float *in0, const float *in1, float *out,
6423+                                                          int size) {
6424+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
6425+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6426+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6427+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
6428+    SIMD_ST_F32(out + index, vout);
6429+  }
6430+  return index;
6431+}
6432+
6433+static inline int ElementOptSubRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out,
6434+                                                           int size) {
6435+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
6436+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6437+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6438+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
6439+    SIMD_ST_F32(out + index, vout);
6440+  }
6441+  return index;
6442+}
6443+
6444+static inline int ElementOptSubRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out,
6445+                                                           int size) {
6446+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
6447+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6448+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6449+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
6450+    SIMD_ST_F32(out + index, vout);
6451+  }
6452+  return index;
6453+}
6454+
6455+static inline int ElementSubAVX512(int index, const float *in0, const float *in1, float *out, int size) {
6456+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6457+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6458+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6459+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
6460+    SIMD_ST_F32(out + index, vout);
6461+  }
6462+  return index;
6463+}
6464+
6465+static inline int ElementSubIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
6466+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6467+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
6468+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
6469+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
6470+    SIMD_ST_EPI32(out + index, vout);
6471+  }
6472+  return index;
6473+}
6474+
6475+static inline int ElementSubReluAVX512(int index, const float *in0, const float *in1, float *out,
6476+                                                   int size) {
6477+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6478+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6479+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6480+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
6481+    SIMD_ST_F32(out + index, vout);
6482+  }
6483+  return index;
6484+}
6485+
6486+static inline int ElementSubRelu6AVX512(int index, const float *in0, const float *in1, float *out,
6487+                                                    int size) {
6488+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6489+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6490+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6491+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
6492+    SIMD_ST_F32(out + index, vout);
6493+  }
6494+  return index;
6495+}
6496+
6497+#undef MS_SIMD_INSTRUCTION
6498+#undef BLOCK_NUM
6499+#pragma GCC pop_options
6500+#undef MS_SIMD_AVX512
6501+#ifdef __cplusplus
6502+};
6503+#endif
6504+#endif
6505diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
6506new file mode 100644
6507index 00000000..88908c90
6508--- /dev/null
6509+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
6510@@ -0,0 +1,36 @@
6511+/**
6512+ * Copyright 2022 Huawei Technologies Co., Ltd
6513+ *
6514+ * Licensed under the Apache License, Version 2.0 (the "License");
6515+ * you may not use this file except in compliance with the License.
6516+ * You may obtain a copy of the License at
6517+ *
6518+ * http://www.apache.org/licenses/LICENSE-2.0
6519+ *
6520+ * Unless required by applicable law or agreed to in writing, software
6521+ * distributed under the License is distributed on an "AS IS" BASIS,
6522+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6523+ * See the License for the specific language governing permissions and
6524+ * limitations under the License.
6525+ */
6526+#ifndef MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_
6527+#define MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_
6528+
6529+#include "nnacl/intrinsics/ms_simd_instructions.h"
6530+#ifdef ENABLE_AVX512
6531+#include "nnacl/avx512/batchnorm_fp32_avx512.h"
6532+#endif
6533+
6534+#ifdef ENABLE_AVX
6535+#include "nnacl/avx/batchnorm_fp32_avx.h"
6536+#endif
6537+
6538+#ifdef ENABLE_SSE
6539+#include "nnacl/sse/batchnorm_fp32_sse.h"
6540+#endif
6541+
6542+#ifdef ENABLE_ARM
6543+#include "nnacl/neon/batchnorm_fp32_neon.h"
6544+#endif
6545+
6546+#endif
6547diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
6548new file mode 100644
6549index 00000000..f36981ab
6550--- /dev/null
6551+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
6552@@ -0,0 +1,36 @@
6553+/**
6554+ * Copyright 2022 Huawei Technologies Co., Ltd
6555+ *
6556+ * Licensed under the Apache License, Version 2.0 (the "License");
6557+ * you may not use this file except in compliance with the License.
6558+ * You may obtain a copy of the License at
6559+ *
6560+ * http://www.apache.org/licenses/LICENSE-2.0
6561+ *
6562+ * Unless required by applicable law or agreed to in writing, software
6563+ * distributed under the License is distributed on an "AS IS" BASIS,
6564+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6565+ * See the License for the specific language governing permissions and
6566+ * limitations under the License.
6567+ */
6568+#ifndef MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_
6569+#define MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_
6570+
6571+#include "nnacl/intrinsics/ms_simd_instructions.h"
6572+#ifdef ENABLE_AVX512
6573+#include "nnacl/avx512/bce_with_logits_loss_fp32_avx512.h"
6574+#endif
6575+
6576+#ifdef ENABLE_AVX
6577+#include "nnacl/avx/bce_with_logits_loss_fp32_avx.h"
6578+#endif
6579+
6580+#ifdef ENABLE_SSE
6581+#include "nnacl/sse/bce_with_logits_loss_fp32_sse.h"
6582+#endif
6583+
6584+#ifdef ENABLE_ARM
6585+#include "nnacl/neon/bce_with_logits_loss_fp32_neon.h"
6586+#endif
6587+
6588+#endif
6589diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
6590new file mode 100644
6591index 00000000..e765b1eb
6592--- /dev/null
6593+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
6594@@ -0,0 +1,36 @@
6595+/**
6596+ * Copyright 2022 Huawei Technologies Co., Ltd
6597+ *
6598+ * Licensed under the Apache License, Version 2.0 (the "License");
6599+ * you may not use this file except in compliance with the License.
6600+ * You may obtain a copy of the License at
6601+ *
6602+ * http://www.apache.org/licenses/LICENSE-2.0
6603+ *
6604+ * Unless required by applicable law or agreed to in writing, software
6605+ * distributed under the License is distributed on an "AS IS" BASIS,
6606+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6607+ * See the License for the specific language governing permissions and
6608+ * limitations under the License.
6609+ */
6610+#ifndef MINDSPORE_NNACL_BIAS_ADD_SIMD_H_
6611+#define MINDSPORE_NNACL_BIAS_ADD_SIMD_H_
6612+
6613+#include "nnacl/intrinsics/ms_simd_instructions.h"
6614+#ifdef ENABLE_AVX512
6615+#include "nnacl/avx512/bias_add_avx512.h"
6616+#endif
6617+
6618+#ifdef ENABLE_AVX
6619+#include "nnacl/avx/bias_add_avx.h"
6620+#endif
6621+
6622+#ifdef ENABLE_SSE
6623+#include "nnacl/sse/bias_add_sse.h"
6624+#endif
6625+
6626+#ifdef ENABLE_ARM
6627+#include "nnacl/neon/bias_add_neon.h"
6628+#endif
6629+
6630+#endif
6631diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
6632new file mode 100644
6633index 00000000..93d8ca33
6634--- /dev/null
6635+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
6636@@ -0,0 +1,36 @@
6637+/**
6638+ * Copyright 2022 Huawei Technologies Co., Ltd
6639+ *
6640+ * Licensed under the Apache License, Version 2.0 (the "License");
6641+ * you may not use this file except in compliance with the License.
6642+ * You may obtain a copy of the License at
6643+ *
6644+ * http://www.apache.org/licenses/LICENSE-2.0
6645+ *
6646+ * Unless required by applicable law or agreed to in writing, software
6647+ * distributed under the License is distributed on an "AS IS" BASIS,
6648+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6649+ * See the License for the specific language governing permissions and
6650+ * limitations under the License.
6651+ */
6652+#ifndef MINDSPORE_NNACL_CAST_BASE_SIMD_H_
6653+#define MINDSPORE_NNACL_CAST_BASE_SIMD_H_
6654+
6655+#include "nnacl/intrinsics/ms_simd_instructions.h"
6656+#ifdef ENABLE_AVX512
6657+#include "nnacl/avx512/cast_base_avx512.h"
6658+#endif
6659+
6660+#ifdef ENABLE_AVX
6661+#include "nnacl/avx/cast_base_avx.h"
6662+#endif
6663+
6664+#ifdef ENABLE_SSE
6665+#include "nnacl/sse/cast_base_sse.h"
6666+#endif
6667+
6668+#ifdef ENABLE_ARM
6669+#include "nnacl/neon/cast_base_neon.h"
6670+#endif
6671+
6672+#endif
6673diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
6674new file mode 100644
6675index 00000000..70f79645
6676--- /dev/null
6677+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
6678@@ -0,0 +1,36 @@
6679+/**
6680+ * Copyright 2022 Huawei Technologies Co., Ltd
6681+ *
6682+ * Licensed under the Apache License, Version 2.0 (the "License");
6683+ * you may not use this file except in compliance with the License.
6684+ * You may obtain a copy of the License at
6685+ *
6686+ * http://www.apache.org/licenses/LICENSE-2.0
6687+ *
6688+ * Unless required by applicable law or agreed to in writing, software
6689+ * distributed under the License is distributed on an "AS IS" BASIS,
6690+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6691+ * See the License for the specific language governing permissions and
6692+ * limitations under the License.
6693+ */
6694+#ifndef MINDSPORE_NNACL_CDIST_FP32_SIMD_H_
6695+#define MINDSPORE_NNACL_CDIST_FP32_SIMD_H_
6696+
6697+#include "nnacl/intrinsics/ms_simd_instructions.h"
6698+#ifdef ENABLE_AVX512
6699+#include "nnacl/avx512/cdist_fp32_avx512.h"
6700+#endif
6701+
6702+#ifdef ENABLE_AVX
6703+#include "nnacl/avx/cdist_fp32_avx.h"
6704+#endif
6705+
6706+#ifdef ENABLE_SSE
6707+#include "nnacl/sse/cdist_fp32_sse.h"
6708+#endif
6709+
6710+#ifdef ENABLE_ARM
6711+#include "nnacl/neon/cdist_fp32_neon.h"
6712+#endif
6713+
6714+#endif
6715diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
6716new file mode 100644
6717index 00000000..b6979626
6718--- /dev/null
6719+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
6720@@ -0,0 +1,36 @@
6721+/**
6722+ * Copyright 2022 Huawei Technologies Co., Ltd
6723+ *
6724+ * Licensed under the Apache License, Version 2.0 (the "License");
6725+ * you may not use this file except in compliance with the License.
6726+ * You may obtain a copy of the License at
6727+ *
6728+ * http://www.apache.org/licenses/LICENSE-2.0
6729+ *
6730+ * Unless required by applicable law or agreed to in writing, software
6731+ * distributed under the License is distributed on an "AS IS" BASIS,
6732+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6733+ * See the License for the specific language governing permissions and
6734+ * limitations under the License.
6735+ */
6736+#ifndef MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_
6737+#define MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_
6738+
6739+#include "nnacl/intrinsics/ms_simd_instructions.h"
6740+#ifdef ENABLE_AVX512
6741+#include "nnacl/avx512/cumsum_fp32_avx512.h"
6742+#endif
6743+
6744+#ifdef ENABLE_AVX
6745+#include "nnacl/avx/cumsum_fp32_avx.h"
6746+#endif
6747+
6748+#ifdef ENABLE_SSE
6749+#include "nnacl/sse/cumsum_fp32_sse.h"
6750+#endif
6751+
6752+#ifdef ENABLE_ARM
6753+#include "nnacl/neon/cumsum_fp32_neon.h"
6754+#endif
6755+
6756+#endif
6757diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
6758new file mode 100644
6759index 00000000..dcae16ff
6760--- /dev/null
6761+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
6762@@ -0,0 +1,36 @@
6763+/**
6764+ * Copyright 2022 Huawei Technologies Co., Ltd
6765+ *
6766+ * Licensed under the Apache License, Version 2.0 (the "License");
6767+ * you may not use this file except in compliance with the License.
6768+ * You may obtain a copy of the License at
6769+ *
6770+ * http://www.apache.org/licenses/LICENSE-2.0
6771+ *
6772+ * Unless required by applicable law or agreed to in writing, software
6773+ * distributed under the License is distributed on an "AS IS" BASIS,
6774+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6775+ * See the License for the specific language governing permissions and
6776+ * limitations under the License.
6777+ */
6778+#ifndef MINDSPORE_NNACL_DIV_FP32_SIMD_H_
6779+#define MINDSPORE_NNACL_DIV_FP32_SIMD_H_
6780+
6781+#include "nnacl/intrinsics/ms_simd_instructions.h"
6782+#ifdef ENABLE_AVX512
6783+#include "nnacl/avx512/div_fp32_avx512.h"
6784+#endif
6785+
6786+#ifdef ENABLE_AVX
6787+#include "nnacl/avx/div_fp32_avx.h"
6788+#endif
6789+
6790+#ifdef ENABLE_SSE
6791+#include "nnacl/sse/div_fp32_sse.h"
6792+#endif
6793+
6794+#ifdef ENABLE_ARM
6795+#include "nnacl/neon/div_fp32_neon.h"
6796+#endif
6797+
6798+#endif
6799diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
6800new file mode 100644
6801index 00000000..704591c5
6802--- /dev/null
6803+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
6804@@ -0,0 +1,36 @@
6805+/**
6806+ * Copyright 2022 Huawei Technologies Co., Ltd
6807+ *
6808+ * Licensed under the Apache License, Version 2.0 (the "License");
6809+ * you may not use this file except in compliance with the License.
6810+ * You may obtain a copy of the License at
6811+ *
6812+ * http://www.apache.org/licenses/LICENSE-2.0
6813+ *
6814+ * Unless required by applicable law or agreed to in writing, software
6815+ * distributed under the License is distributed on an "AS IS" BASIS,
6816+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6817+ * See the License for the specific language governing permissions and
6818+ * limitations under the License.
6819+ */
6820+#ifndef MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_
6821+#define MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_
6822+
6823+#include "nnacl/intrinsics/ms_simd_instructions.h"
6824+#ifdef ENABLE_AVX512
6825+#include "nnacl/avx512/dropout_fp32_avx512.h"
6826+#endif
6827+
6828+#ifdef ENABLE_AVX
6829+#include "nnacl/avx/dropout_fp32_avx.h"
6830+#endif
6831+
6832+#ifdef ENABLE_SSE
6833+#include "nnacl/sse/dropout_fp32_sse.h"
6834+#endif
6835+
6836+#ifdef ENABLE_ARM
6837+#include "nnacl/neon/dropout_fp32_neon.h"
6838+#endif
6839+
6840+#endif
6841diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
6842new file mode 100644
6843index 00000000..272f5934
6844--- /dev/null
6845+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
6846@@ -0,0 +1,36 @@
6847+/**
6848+ * Copyright 2022 Huawei Technologies Co., Ltd
6849+ *
6850+ * Licensed under the Apache License, Version 2.0 (the "License");
6851+ * you may not use this file except in compliance with the License.
6852+ * You may obtain a copy of the License at
6853+ *
6854+ * http://www.apache.org/licenses/LICENSE-2.0
6855+ *
6856+ * Unless required by applicable law or agreed to in writing, software
6857+ * distributed under the License is distributed on an "AS IS" BASIS,
6858+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6859+ * See the License for the specific language governing permissions and
6860+ * limitations under the License.
6861+ */
6862+#ifndef MINDSPORE_NNACL_EXP_FP32_SIMD_H_
6863+#define MINDSPORE_NNACL_EXP_FP32_SIMD_H_
6864+
6865+#include "nnacl/intrinsics/ms_simd_instructions.h"
6866+#ifdef ENABLE_AVX512
6867+#include "nnacl/avx512/exp_fp32_avx512.h"
6868+#endif
6869+
6870+#ifdef ENABLE_AVX
6871+#include "nnacl/avx/exp_fp32_avx.h"
6872+#endif
6873+
6874+#ifdef ENABLE_SSE
6875+#include "nnacl/sse/exp_fp32_sse.h"
6876+#endif
6877+
6878+#ifdef ENABLE_ARM
6879+#include "nnacl/neon/exp_fp32_neon.h"
6880+#endif
6881+
6882+#endif
6883diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
6884new file mode 100644
6885index 00000000..f3099405
6886--- /dev/null
6887+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
6888@@ -0,0 +1,36 @@
6889+/**
6890+ * Copyright 2022 Huawei Technologies Co., Ltd
6891+ *
6892+ * Licensed under the Apache License, Version 2.0 (the "License");
6893+ * you may not use this file except in compliance with the License.
6894+ * You may obtain a copy of the License at
6895+ *
6896+ * http://www.apache.org/licenses/LICENSE-2.0
6897+ *
6898+ * Unless required by applicable law or agreed to in writing, software
6899+ * distributed under the License is distributed on an "AS IS" BASIS,
6900+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6901+ * See the License for the specific language governing permissions and
6902+ * limitations under the License.
6903+ */
6904+#ifndef MINDSPORE_NNACL_FILL_BASE_SIMD_H_
6905+#define MINDSPORE_NNACL_FILL_BASE_SIMD_H_
6906+
6907+#include "nnacl/intrinsics/ms_simd_instructions.h"
6908+#ifdef ENABLE_AVX512
6909+#include "nnacl/avx512/fill_base_avx512.h"
6910+#endif
6911+
6912+#ifdef ENABLE_AVX
6913+#include "nnacl/avx/fill_base_avx.h"
6914+#endif
6915+
6916+#ifdef ENABLE_SSE
6917+#include "nnacl/sse/fill_base_sse.h"
6918+#endif
6919+
6920+#ifdef ENABLE_ARM
6921+#include "nnacl/neon/fill_base_neon.h"
6922+#endif
6923+
6924+#endif
6925diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
6926new file mode 100644
6927index 00000000..a3931c20
6928--- /dev/null
6929+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
6930@@ -0,0 +1,36 @@
6931+/**
6932+ * Copyright 2022 Huawei Technologies Co., Ltd
6933+ *
6934+ * Licensed under the Apache License, Version 2.0 (the "License");
6935+ * you may not use this file except in compliance with the License.
6936+ * You may obtain a copy of the License at
6937+ *
6938+ * http://www.apache.org/licenses/LICENSE-2.0
6939+ *
6940+ * Unless required by applicable law or agreed to in writing, software
6941+ * distributed under the License is distributed on an "AS IS" BASIS,
6942+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6943+ * See the License for the specific language governing permissions and
6944+ * limitations under the License.
6945+ */
6946+#ifndef MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_
6947+#define MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_
6948+
6949+#include "nnacl/intrinsics/ms_simd_instructions.h"
6950+#ifdef ENABLE_AVX512
6951+#include "nnacl/avx512/group_norm_fp32_avx512.h"
6952+#endif
6953+
6954+#ifdef ENABLE_AVX
6955+#include "nnacl/avx/group_norm_fp32_avx.h"
6956+#endif
6957+
6958+#ifdef ENABLE_SSE
6959+#include "nnacl/sse/group_norm_fp32_sse.h"
6960+#endif
6961+
6962+#ifdef ENABLE_ARM
6963+#include "nnacl/neon/group_norm_fp32_neon.h"
6964+#endif
6965+
6966+#endif
6967diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
6968new file mode 100644
6969index 00000000..c08461d3
6970--- /dev/null
6971+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
6972@@ -0,0 +1,36 @@
6973+/**
6974+ * Copyright 2022 Huawei Technologies Co., Ltd
6975+ *
6976+ * Licensed under the Apache License, Version 2.0 (the "License");
6977+ * you may not use this file except in compliance with the License.
6978+ * You may obtain a copy of the License at
6979+ *
6980+ * http://www.apache.org/licenses/LICENSE-2.0
6981+ *
6982+ * Unless required by applicable law or agreed to in writing, software
6983+ * distributed under the License is distributed on an "AS IS" BASIS,
6984+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6985+ * See the License for the specific language governing permissions and
6986+ * limitations under the License.
6987+ */
6988+#ifndef MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_
6989+#define MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_
6990+
6991+#include "nnacl/intrinsics/ms_simd_instructions.h"
6992+#ifdef ENABLE_AVX512
6993+#include "nnacl/avx512/layer_norm_fp32_avx512.h"
6994+#endif
6995+
6996+#ifdef ENABLE_AVX
6997+#include "nnacl/avx/layer_norm_fp32_avx.h"
6998+#endif
6999+
7000+#ifdef ENABLE_SSE
7001+#include "nnacl/sse/layer_norm_fp32_sse.h"
7002+#endif
7003+
7004+#ifdef ENABLE_ARM
7005+#include "nnacl/neon/layer_norm_fp32_neon.h"
7006+#endif
7007+
7008+#endif
7009diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
7010new file mode 100644
7011index 00000000..1250f3fc
7012--- /dev/null
7013+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
7014@@ -0,0 +1,36 @@
7015+/**
7016+ * Copyright 2022 Huawei Technologies Co., Ltd
7017+ *
7018+ * Licensed under the Apache License, Version 2.0 (the "License");
7019+ * you may not use this file except in compliance with the License.
7020+ * You may obtain a copy of the License at
7021+ *
7022+ * http://www.apache.org/licenses/LICENSE-2.0
7023+ *
7024+ * Unless required by applicable law or agreed to in writing, software
7025+ * distributed under the License is distributed on an "AS IS" BASIS,
7026+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7027+ * See the License for the specific language governing permissions and
7028+ * limitations under the License.
7029+ */
7030+#ifndef MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_
7031+#define MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_
7032+
7033+#include "nnacl/intrinsics/ms_simd_instructions.h"
7034+#ifdef ENABLE_AVX512
7035+#include "nnacl/avx512/matmul_fp32_avx512.h"
7036+#endif
7037+
7038+#ifdef ENABLE_AVX
7039+#include "nnacl/avx/matmul_fp32_avx.h"
7040+#endif
7041+
7042+#ifdef ENABLE_SSE
7043+#include "nnacl/sse/matmul_fp32_sse.h"
7044+#endif
7045+
7046+#ifdef ENABLE_ARM
7047+#include "nnacl/neon/matmul_fp32_neon.h"
7048+#endif
7049+
7050+#endif
7051diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
7052new file mode 100644
7053index 00000000..31e08b08
7054--- /dev/null
7055+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
7056@@ -0,0 +1,36 @@
7057+/**
7058+ * Copyright 2022 Huawei Technologies Co., Ltd
7059+ *
7060+ * Licensed under the Apache License, Version 2.0 (the "License");
7061+ * you may not use this file except in compliance with the License.
7062+ * You may obtain a copy of the License at
7063+ *
7064+ * http://www.apache.org/licenses/LICENSE-2.0
7065+ *
7066+ * Unless required by applicable law or agreed to in writing, software
7067+ * distributed under the License is distributed on an "AS IS" BASIS,
7068+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7069+ * See the License for the specific language governing permissions and
7070+ * limitations under the License.
7071+ */
7072+#ifndef MINDSPORE_NNACL_MUL_FP32_SIMD_H_
7073+#define MINDSPORE_NNACL_MUL_FP32_SIMD_H_
7074+
7075+#include "nnacl/intrinsics/ms_simd_instructions.h"
7076+#ifdef ENABLE_AVX512
7077+#include "nnacl/avx512/mul_fp32_avx512.h"
7078+#endif
7079+
7080+#ifdef ENABLE_AVX
7081+#include "nnacl/avx/mul_fp32_avx.h"
7082+#endif
7083+
7084+#ifdef ENABLE_SSE
7085+#include "nnacl/sse/mul_fp32_sse.h"
7086+#endif
7087+
7088+#ifdef ENABLE_ARM
7089+#include "nnacl/neon/mul_fp32_neon.h"
7090+#endif
7091+
7092+#endif
7093diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
7094new file mode 100644
7095index 00000000..42d163f6
7096--- /dev/null
7097+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
7098@@ -0,0 +1,220 @@
7099+/**
7100+ * Copyright 2022 Huawei Technologies Co., Ltd
7101+ *
7102+ * Licensed under the Apache License, Version 2.0 (the "License");
7103+ * you may not use this file except in compliance with the License.
7104+ * You may obtain a copy of the License at
7105+ *
7106+ * http://www.apache.org/licenses/LICENSE-2.0
7107+ *
7108+ * Unless required by applicable law or agreed to in writing, software
7109+ * distributed under the License is distributed on an "AS IS" BASIS,
7110+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7111+ * See the License for the specific language governing permissions and
7112+ * limitations under the License.
7113+ */
7114+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
7115+#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
7116+
7117+#include "nnacl/intrinsics/ms_simd_instructions.h"
7118+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7119+
7120+#ifdef __cplusplus
7121+extern "C" {
7122+#endif
7123+
7124+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7125+#define BLOCK_NUM 4
7126+#define MS_SIMD_NEON
7127+
7128+static inline int Fp32ReluNEON(int index, const float *src, int length, float *dst) {
7129+    SIMD_F32 zero = SIMD_SET0_F32;
7130+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7131+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
7132+    }
7133+    return index;
7134+}
7135+
7136+static inline int Int32ReluNEON(int index, const int32_t *src, int length, int32_t *dst) {
7137+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0);
7138+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7139+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
7140+    }
7141+    return index;
7142+}
7143+
7144+static inline int Fp32Relu6NEON(int index, const float *src, int length, float *dst) {
7145+    SIMD_F32 zero = SIMD_SET0_F32;
7146+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
7147+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7148+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
7149+    }
7150+    return index;
7151+}
7152+
7153+static inline int LReluNEON(int index, const float *src, int length, float *dst, float alpha) {
7154+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
7155+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7156+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7157+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
7158+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
7159+    }
7160+    return index;
7161+}
7162+
7163+static inline int SigmoidNEON(int index, const float *src, int length, float *dst) {
7164+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7165+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
7166+        SIMD_ST_F32(dst + index,
7167+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
7168+    }
7169+    return index;
7170+}
7171+
7172+static inline int TanhNEON(int index, const float *src, int length, float *dst) {
7173+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7174+        SIMD_F32 input = SIMD_LD_F32(src + index);
7175+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
7176+    }
7177+    return index;
7178+}
7179+
7180+static inline int SwishNEON(int index, const float *src, int length, float *dst) {
7181+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7182+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
7183+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
7184+        SIMD_ST_F32(dst + index,
7185+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
7186+    }
7187+    return index;
7188+}
7189+
7190+static inline int HSwishNEON(int index, const float *src, int length, float *dst) {
7191+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7192+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
7193+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
7194+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
7195+    }
7196+    return index;
7197+}
7198+
7199+static inline int HSigmoidNEON(int index, const float *src, int length, float *dst) {
7200+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7201+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
7202+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
7203+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
7204+    }
7205+    return index;
7206+}
7207+
7208+static inline int HardTanhNoLimitMinNEON(int index, const float *src, int length, float *dst, float min_val,
7209+                                            float max_val) {
7210+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7211+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
7212+    }
7213+    return index;
7214+}
7215+
7216+static inline int HardTanhNoLimitMaxNEON(int index, const float *src, int length, float *dst, float min_val,
7217+                                            float max_val) {
7218+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7219+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
7220+    }
7221+    return index;
7222+}
7223+
7224+static inline int HardTanhLimitMinMaxNEON(int index, const float *src, int length, float *dst, float min_val,
7225+                                             float max_val) {
7226+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7227+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
7228+    }
7229+    return index;
7230+}
7231+
7232+static inline int GeluApproximateNEON(int index, const float *src, int length, float *dst) {
7233+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7234+        SIMD_F32 in = SIMD_LD_F32(src + index);
7235+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
7236+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
7237+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
7238+    }
7239+    return index;
7240+}
7241+
7242+static inline int GeluNEON(int index, const float *src, int length, float *dst) {
7243+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
7244+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
7245+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
7246+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7247+      SIMD_F32 in = SIMD_LD_F32(src + index);
7248+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
7249+      SIMD_ST_F32(dst + index, res);
7250+    }
7251+    return index;
7252+}
7253+
7254+static inline int EluNEON(int index, const float *src, int length, float *dst, float alpha) {
7255+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7256+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7257+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
7258+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
7259+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
7260+    }
7261+    return index;
7262+}
7263+
7264+static inline int CeluNEON(int index, const float *src, int length, float *dst, float alpha) {
7265+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7266+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7267+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
7268+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
7269+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
7270+    }
7271+    return index;
7272+}
7273+
7274+static inline int HShrinkNEON(int index, const float *src, int length, float *dst, float lambd) {
7275+    const float neg_lambd = -1 * lambd;
7276+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7277+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7278+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
7279+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
7280+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
7281+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
7282+    }
7283+    return index;
7284+}
7285+
7286+static inline int SoftShrinkNEON(int index, const float *src, int length, float *dst, float lambd) {
7287+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
7288+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
7289+
7290+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7291+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
7292+        /* v0 = (in > lamdb) & (in - lamdb) */
7293+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
7294+        /* v1 = (in < -lamdb) & (in + lamdb) */
7295+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
7296+        /* out = (v0 | v1) */
7297+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
7298+    }
7299+    return index;
7300+}
7301+
7302+static inline int SoftsignFp32OptNEON(int index, const float *src, int length, float *dst) {
7303+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7304+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7305+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
7306+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
7307+    }
7308+    return index;
7309+}
7310+
7311+#undef MS_SIMD_INSTRUCTION
7312+#undef BLOCK_NUM
7313+
7314+#undef MS_SIMD_NEON
7315+#ifdef __cplusplus
7316+}
7317+#endif
7318+#endif
7319diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
7320new file mode 100644
7321index 00000000..df832e51
7322--- /dev/null
7323+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
7324@@ -0,0 +1,56 @@
7325+/**
7326+ * Copyright 2022 Huawei Technologies Co., Ltd
7327+ *
7328+ * Licensed under the Apache License, Version 2.0 (the "License");
7329+ * you may not use this file except in compliance with the License.
7330+ * You may obtain a copy of the License at
7331+ *
7332+ * http://www.apache.org/licenses/LICENSE-2.0
7333+ *
7334+ * Unless required by applicable law or agreed to in writing, software
7335+ * distributed under the License is distributed on an "AS IS" BASIS,
7336+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7337+ * See the License for the specific language governing permissions and
7338+ * limitations under the License.
7339+ */
7340+
7341+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_
7342+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_
7343+
7344+#include "nnacl/intrinsics/ms_simd_instructions.h"
7345+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7346+
7347+#ifdef __cplusplus
7348+extern "C" {
7349+#endif
7350+
7351+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7352+#define BLOCK_NUM 4
7353+#define MS_SIMD_NEON
7354+
7355+static inline int ShrinkGradNEON(int index, const float *src0, const float *src1,
7356+                                               int length, float *dst, float lambd) {
7357+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
7358+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
7359+
7360+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7361+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
7362+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
7363+
7364+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
7365+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
7366+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
7367+
7368+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
7369+    }
7370+    return index;
7371+}
7372+
7373+#undef MS_SIMD_INSTRUCTION
7374+#undef BLOCK_NUM
7375+
7376+#undef MS_SIMD_NEON
7377+#ifdef __cplusplus
7378+}
7379+#endif
7380+#endif
7381diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
7382new file mode 100644
7383index 00000000..fda41ec2
7384--- /dev/null
7385+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
7386@@ -0,0 +1,209 @@
7387+/**
7388+ * Copyright 2022 Huawei Technologies Co., Ltd
7389+ *
7390+ * Licensed under the Apache License, Version 2.0 (the "License");
7391+ * you may not use this file except in compliance with the License.
7392+ * You may obtain a copy of the License at
7393+ *
7394+ * http://www.apache.org/licenses/LICENSE-2.0
7395+ *
7396+ * Unless required by applicable law or agreed to in writing, software
7397+ * distributed under the License is distributed on an "AS IS" BASIS,
7398+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7399+ * See the License for the specific language governing permissions and
7400+ * limitations under the License.
7401+ */
7402+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_
7403+#define MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_
7404+
7405+#include "nnacl/intrinsics/ms_simd_instructions.h"
7406+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7407+
7408+#ifdef __cplusplus
7409+extern "C" {
7410+#endif
7411+
7412+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7413+#define BLOCK_NUM 4
7414+#define MS_SIMD_NEON
7415+#ifdef MS_SIMD_AVX512
7416+  static inline size_t AdamWeightDecayFp32NEON(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7417+    const float *gradient, size_t end) {
7418+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7419+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7420+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7421+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7422+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7423+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7424+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7425+
7426+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7427+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
7428+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7429+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7430+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
7431+
7432+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7433+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7434+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7435+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7436+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7437+    avx_r0 = SIMD_SQRT_F32(v_r);
7438+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7439+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7440+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7441+    SIMD_ST_F32(m + index, m_r);
7442+    SIMD_ST_F32(v + index, v_r);
7443+    SIMD_ST_F32(var + index, var_r);
7444+  }
7445+
7446+  return index;
7447+}
7448+
7449+static inline size_t FusedCastAdamFp32Fp16NEON(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7450+    float global_norm_reciprocal, size_t end) {
7451+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7452+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7453+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7454+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7455+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7456+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7457+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7458+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7459+
7460+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7461+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
7462+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7463+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7464+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
7465+
7466+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7467+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7468+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7469+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7470+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7471+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7472+    avx_r0 = SIMD_SQRT_F32(v_r);
7473+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7474+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7475+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7476+    SIMD_ST_F32(var + index, var_r);
7477+    SIMD_ST_F32(m + index, m_r);
7478+    SIMD_ST_F32(v + index, v_r);
7479+  }
7480+
7481+  return index;
7482+}
7483+
7484+static inline size_t FusedCastAdamFp32Fp32NEON(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7485+    float global_norm_reciprocal, size_t end) {
7486+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7487+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7488+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7489+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7490+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7491+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7492+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7493+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7494+
7495+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7496+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
7497+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7498+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7499+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
7500+
7501+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7502+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7503+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7504+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7505+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7506+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7507+    avx_r0 = SIMD_SQRT_F32(v_r);
7508+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7509+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7510+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7511+    SIMD_ST_F32(var + index, var_r);
7512+    SIMD_ST_F32(m + index, m_r);
7513+    SIMD_ST_F32(v + index, v_r);
7514+  }
7515+
7516+  return index;
7517+}
7518+
7519+static inline size_t FusedCastAdamFp16Fp16NEON(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7520+    float global_norm_reciprocal, size_t end) {
7521+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7522+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7523+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7524+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7525+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7526+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7527+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7528+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7529+
7530+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7531+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
7532+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7533+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7534+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
7535+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7536+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7537+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7538+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7539+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7540+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7541+    avx_r0 = SIMD_SQRT_F32(v_r);
7542+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7543+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7544+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7545+    SIMD_ST_F32(m + index, m_r);
7546+    SIMD_ST_F32(v + index, v_r);
7547+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
7548+  }
7549+
7550+  return index;
7551+}
7552+
7553+static inline size_t FusedCastAdamFp16Fp32NEON(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7554+    float global_norm_reciprocal, size_t end) {
7555+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7556+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7557+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7558+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7559+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7560+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7561+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7562+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7563+
7564+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7565+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
7566+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7567+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7568+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
7569+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7570+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7571+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7572+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7573+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7574+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7575+    avx_r0 = SIMD_SQRT_F32(v_r);
7576+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7577+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7578+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7579+    SIMD_ST_F32(m + index, m_r);
7580+    SIMD_ST_F32(v + index, v_r);
7581+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
7582+  }
7583+
7584+  return index;
7585+}
7586+#endif
7587+
7588+#undef MS_SIMD_INSTRUCTION
7589+#undef BLOCK_NUM
7590+
7591+#undef MS_SIMD_NEON
7592+#ifdef __cplusplus
7593+}
7594+#endif
7595+#endif
7596diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
7597new file mode 100644
7598index 00000000..4ef32418
7599--- /dev/null
7600+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
7601@@ -0,0 +1,123 @@
7602+/**
7603+ * Copyright 2022 Huawei Technologies Co., Ltd
7604+ *
7605+ * Licensed under the Apache License, Version 2.0 (the "License");
7606+ * you may not use this file except in compliance with the License.
7607+ * You may obtain a copy of the License at
7608+ *
7609+ * http://www.apache.org/licenses/LICENSE-2.0
7610+ *
7611+ * Unless required by applicable law or agreed to in writing, software
7612+ * distributed under the License is distributed on an "AS IS" BASIS,
7613+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7614+ * See the License for the specific language governing permissions and
7615+ * limitations under the License.
7616+ */
7617+
7618+#ifndef MINDSPORE_NNACL_FP32_ADD_NEON_H_
7619+#define MINDSPORE_NNACL_FP32_ADD_NEON_H_
7620+
7621+#include "nnacl/intrinsics/ms_simd_instructions.h"
7622+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7623+
7624+#ifdef __cplusplus
7625+extern "C" {
7626+#endif
7627+
7628+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7629+#define BLOCK_NUM 4
7630+#define MS_SIMD_NEON
7631+
7632+static inline int ElementOptAddNEON(int index, const float *in0, const float *in1, float *out, int size) {
7633+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
7634+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7635+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7636+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
7637+    SIMD_ST_F32(out + index, vout);
7638+  }
7639+  return index;
7640+}
7641+
7642+static inline int ElementOptAddIntNEON(int index, const int *in0, const int *in1, int *out,
7643+                                                     int size) {
7644+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
7645+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7646+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
7647+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
7648+    SIMD_ST_EPI32(out + index, vout);
7649+  }
7650+  return index;
7651+}
7652+
7653+static inline int ElementOptAddReluNEON(int index, const float *in0, const float *in1, float *out,
7654+                                                      int size) {
7655+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
7656+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7657+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7658+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
7659+    SIMD_ST_F32(out + index, vout);
7660+  }
7661+  return index;
7662+}
7663+
7664+static inline int ElementOptAddRelu6NEON(int index, const float *in0, const float *in1, float *out,
7665+                                                       int size) {
7666+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
7667+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7668+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7669+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
7670+    SIMD_ST_F32(out + index, vout);
7671+  }
7672+  return index;
7673+}
7674+
7675+static inline int ElementAddNEON(int index, const float *in0, const float *in1, float *out, int size) {
7676+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7677+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
7678+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7679+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
7680+    SIMD_ST_F32(out + index, vout);
7681+  }
7682+  return index;
7683+}
7684+
7685+static inline int ElementAddReluNEON(int index, const float *in0, const float *in1, float *out,
7686+                                                   int size) {
7687+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7688+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
7689+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7690+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
7691+    SIMD_ST_F32(out + index, vout);
7692+  }
7693+  return index;
7694+}
7695+
7696+static inline int ElementAddRelu6NEON(int index, const float *in0, const float *in1, float *out,
7697+                                                    int size) {
7698+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7699+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
7700+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7701+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
7702+    SIMD_ST_F32(out + index, vout);
7703+  }
7704+  return index;
7705+}
7706+
7707+static inline int ElementAddIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7708+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7709+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
7710+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
7711+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
7712+    SIMD_ST_EPI32(out + index, vout);
7713+  }
7714+  return index;
7715+}
7716+
7717+#undef MS_SIMD_INSTRUCTION
7718+#undef BLOCK_NUM
7719+
7720+#undef MS_SIMD_NEON
7721+#ifdef __cplusplus
7722+}
7723+#endif
7724+#endif
7725diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
7726new file mode 100644
7727index 00000000..2449c07d
7728--- /dev/null
7729+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
7730@@ -0,0 +1,253 @@
7731+/**
7732+ * Copyright 2022 Huawei Technologies Co., Ltd
7733+ *
7734+ * Licensed under the Apache License, Version 2.0 (the "License");
7735+ * you may not use this file except in compliance with the License.
7736+ * You may obtain a copy of the License at
7737+ *
7738+ * http://www.apache.org/licenses/LICENSE-2.0
7739+ *
7740+ * Unless required by applicable law or agreed to in writing, software
7741+ * distributed under the License is distributed on an "AS IS" BASIS,
7742+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7743+ * See the License for the specific language governing permissions and
7744+ * limitations under the License.
7745+ */
7746+
7747+#ifndef MINDSPORE_NNACL_ARITHMETIC_NEON_H_
7748+#define MINDSPORE_NNACL_ARITHMETIC_NEON_H_
7749+
7750+#include "nnacl/intrinsics/ms_simd_instructions.h"
7751+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7752+
7753+#ifdef __cplusplus
7754+extern "C" {
7755+#endif
7756+
7757+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7758+#define BLOCK_NUM 4
7759+#define MS_SIMD_NEON
7760+
7761+#ifndef MS_SIMD_NEON
7762+static inline int ElementFloorModNEON(int index, const float *in0, const float *in1, float *out, int size) {
7763+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7764+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7765+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7766+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7767+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
7768+    SIMD_ST_F32(out + index, out_tmp);
7769+  }
7770+  return index;
7771+}
7772+
7773+static inline int ElementOptFloorModNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7774+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7775+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7776+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7777+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7778+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
7779+    SIMD_ST_F32(out + index, out_tmp);
7780+  }
7781+  return index;
7782+}
7783+
7784+static inline int ElementOptFloorModNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7785+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7786+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7787+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7788+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7789+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
7790+    SIMD_ST_F32(out + index, out_tmp);
7791+  }
7792+  return index;
7793+}
7794+
7795+static inline int ElementFloorDivNEON(int index, const float *in0, const float *in1, float *out, int size) {
7796+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7797+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7798+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7799+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7800+    SIMD_ST_F32(out + index, floor_tmp);
7801+  }
7802+  return index;
7803+}
7804+
7805+static inline int ElementOptFloorDivNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7806+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7807+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7808+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7809+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7810+    SIMD_ST_F32(out + index, out_tmp);
7811+  }
7812+  return index;
7813+}
7814+
7815+static inline int ElementOptFloorDivNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7816+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7817+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7818+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7819+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7820+    SIMD_ST_F32(out + index, out_tmp);
7821+  }
7822+  return index;
7823+}
7824+#endif
7825+
7826+static inline int ElementFloorDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7827+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7828+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7829+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7830+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
7831+    SIMD_ST_EPI32(out + index, out_tmp);
7832+  }
7833+  return index;
7834+}
7835+
7836+static inline int ElementOptFloorDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
7837+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
7838+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7839+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7840+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
7841+    SIMD_ST_EPI32(out + index, out_tmp);
7842+  }
7843+  return index;
7844+}
7845+
7846+static inline int ElementOptFloorDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
7847+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
7848+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7849+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7850+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
7851+    SIMD_ST_EPI32(out + index, out_tmp);
7852+  }
7853+  return index;
7854+}
7855+
7856+static inline int ElementMaximumNEON(int index, const float *in0, const float *in1, float *out, int size) {
7857+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7858+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7859+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7860+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
7861+    SIMD_ST_F32(out + index, out_tmp);
7862+  }
7863+  return index;
7864+}
7865+
7866+static inline int ElementOptMaximumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7867+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7868+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7869+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7870+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
7871+    SIMD_ST_F32(out + index, out_tmp);
7872+  }
7873+  return index;
7874+}
7875+
7876+static inline int ElementOptMaximumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7877+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7878+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7879+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7880+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
7881+    SIMD_ST_F32(out + index, out_tmp);
7882+  }
7883+  return index;
7884+}
7885+
7886+static inline int ElementMaximumIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7887+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7888+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7889+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7890+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
7891+    SIMD_ST_EPI32(out + index, out_tmp);
7892+  }
7893+  return index;
7894+}
7895+
7896+static inline int ElementOptMaximumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
7897+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
7898+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7899+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7900+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
7901+    SIMD_ST_EPI32(out + index, out_tmp);
7902+  }
7903+  return index;
7904+}
7905+
7906+static inline int ElementOptMaximumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
7907+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
7908+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7909+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7910+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
7911+    SIMD_ST_EPI32(out + index, out_tmp);
7912+  }
7913+  return index;
7914+}
7915+
7916+static inline int ElementMinimumIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7917+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7918+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7919+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7920+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
7921+    SIMD_ST_EPI32(out + index, out_tmp);
7922+  }
7923+  return index;
7924+}
7925+
7926+static inline int ElementOptMinimumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
7927+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
7928+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7929+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7930+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
7931+    SIMD_ST_EPI32(out + index, out_tmp);
7932+  }
7933+  return index;
7934+}
7935+
7936+static inline int ElementOptMinimumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
7937+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
7938+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7939+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7940+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
7941+    SIMD_ST_EPI32(out + index, out_tmp);
7942+  }
7943+  return index;
7944+}
7945+
7946+static inline int ElementMinimumNEON(int index, const float *in0, const float *in1, float *out, int size) {
7947+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7948+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7949+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7950+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
7951+    SIMD_ST_F32(out + index, out_tmp);
7952+  }
7953+  return index;
7954+}
7955+
7956+static inline int ElementOptMinimumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7957+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7958+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7959+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7960+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
7961+    SIMD_ST_F32(out + index, out_tmp);
7962+  }
7963+  return index;
7964+}
7965+
7966+static inline int ElementOptMinimumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7967+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7968+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7969+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7970+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
7971+    SIMD_ST_F32(out + index, out_tmp);
7972+  }
7973+  return index;
7974+}
7975+
7976+#undef MS_SIMD_INSTRUCTION
7977+#undef BLOCK_NUM
7978+
7979+#undef MS_SIMD_NEON
7980+#ifdef __cplusplus
7981+}
7982+#endif
7983+#endif
7984diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
7985new file mode 100644
7986index 00000000..682148d7
7987--- /dev/null
7988+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
7989@@ -0,0 +1,128 @@
7990+/**
7991+ * Copyright 2022 Huawei Technologies Co., Ltd
7992+ *
7993+ * Licensed under the Apache License, Version 2.0 (the "License");
7994+ * you may not use this file except in compliance with the License.
7995+ * You may obtain a copy of the License at
7996+ *
7997+ * http://www.apache.org/licenses/LICENSE-2.0
7998+ *
7999+ * Unless required by applicable law or agreed to in writing, software
8000+ * distributed under the License is distributed on an "AS IS" BASIS,
8001+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8002+ * See the License for the specific language governing permissions and
8003+ * limitations under the License.
8004+ */
8005+
8006+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_
8007+#define MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_
8008+
8009+#include "nnacl/intrinsics/ms_simd_instructions.h"
8010+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8011+
8012+#ifdef __cplusplus
8013+extern "C" {
8014+#endif
8015+
8016+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8017+#define BLOCK_NUM 4
8018+#define MS_SIMD_NEON
8019+
8020+#if defined(MS_SIMD_AVX512)
8021+// only avx512 support abs fp32 instruction
8022+static inline int ElementAbsNEON(int index, const float *input, float *output, const int element_size) {
8023+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8024+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
8025+  }
8026+  return index;
8027+}
8028+
8029+static inline int ElementAbsIntNEON(int index, const int *input, int *output, const int element_size) {
8030+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8031+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
8032+  }
8033+  return index;
8034+}
8035+#endif
8036+
8037+static inline int ElementSquareNEON(int index, const float *input, float *output, const int element_size) {
8038+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8039+    SIMD_F32 vin = SIMD_LD_F32(input + index);
8040+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
8041+  }
8042+  return index;
8043+}
8044+
8045+static inline int ElementSqrtNEON(int index, const float *input, float *output, const int element_size) {
8046+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8047+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
8048+  }
8049+  return index;
8050+}
8051+
8052+static inline int ElementRsqrtNEON(int index, const float *input, float *output, const int element_size) {
8053+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8054+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
8055+  }
8056+  return index;
8057+}
8058+
8059+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
8060+// avx512 dont support round fp32 instruction
8061+static inline int ElementRoundNEON(int index, const float *input, float *output, const int element_size) {
8062+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8063+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
8064+  }
8065+  return index;
8066+}
8067+#endif
8068+
8069+#ifndef MS_SIMD_NEON
8070+// neon dont support floor fp32 instruction
8071+static inline int ElementFloorNEON(int index, const float *input, float *output, const int element_size) {
8072+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8073+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
8074+  }
8075+  return index;
8076+}
8077+#endif
8078+
8079+#ifndef MS_SIMD_NEON
8080+static inline int ElementCeilNEON(int index, const float *input, float *output, const int element_size) {
8081+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8082+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
8083+  }
8084+  return index;
8085+}
8086+#endif
8087+
8088+static inline int ElementNegativeNEON(int index, const float *input, float *output, const int element_size) {
8089+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8090+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
8091+  }
8092+  return index;
8093+}
8094+
8095+static inline int ElementNegativeIntNEON(int index, const int *input, int *output, const int element_size) {
8096+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8097+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
8098+  }
8099+  return index;
8100+}
8101+
8102+static inline int ElementReciprocalNEON(int index, const float *input, float *output, const int element_size) {
8103+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
8104+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8105+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
8106+  }
8107+  return index;
8108+}
8109+
8110+#undef MS_SIMD_INSTRUCTION
8111+#undef BLOCK_NUM
8112+
8113+#undef MS_SIMD_NEON
8114+#ifdef __cplusplus
8115+}
8116+#endif
8117+#endif
8118diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
8119new file mode 100644
8120index 00000000..5e169d62
8121--- /dev/null
8122+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
8123@@ -0,0 +1,66 @@
8124+/**
8125+ * Copyright 2022 Huawei Technologies Co., Ltd
8126+ *
8127+ * Licensed under the Apache License, Version 2.0 (the "License");
8128+ * you may not use this file except in compliance with the License.
8129+ * You may obtain a copy of the License at
8130+ *
8131+ * http://www.apache.org/licenses/LICENSE-2.0
8132+ *
8133+ * Unless required by applicable law or agreed to in writing, software
8134+ * distributed under the License is distributed on an "AS IS" BASIS,
8135+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8136+ * See the License for the specific language governing permissions and
8137+ * limitations under the License.
8138+ */
8139+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_
8140+#define MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_
8141+
8142+#include "nnacl/intrinsics/ms_simd_instructions.h"
8143+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8144+
8145+#ifdef __cplusplus
8146+extern "C" {
8147+#endif
8148+
8149+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8150+#define BLOCK_NUM 4
8151+#define MS_SIMD_NEON
8152+
8153+static inline int BatchNormFp32NEON(int index, const float *input, const float *mean,
8154+  const float *variance, int channel, float epsilon, float *output) {
8155+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8156+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
8157+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
8158+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
8159+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
8160+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
8161+    SIMD_ST_F32(output + index, output_data);
8162+  }
8163+  return index;
8164+}
8165+
8166+static inline int FusedBatchNormFp32NEON(int index, const float *input, const float *scale,
8167+  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
8168+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8169+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
8170+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
8171+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
8172+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
8173+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
8174+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
8175+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
8176+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
8177+    SIMD_ST_F32(output + index, output_data);
8178+  }
8179+  return index;
8180+}
8181+
8182+#undef MS_SIMD_INSTRUCTION
8183+#undef BLOCK_NUM
8184+
8185+#undef MS_SIMD_NEON
8186+#ifdef __cplusplus
8187+}
8188+#endif
8189+#endif
8190diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
8191new file mode 100644
8192index 00000000..3f52857c
8193--- /dev/null
8194+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
8195@@ -0,0 +1,68 @@
8196+/**
8197+ * Copyright 2022 Huawei Technologies Co., Ltd
8198+ *
8199+ * Licensed under the Apache License, Version 2.0 (the "License");
8200+ * you may not use this file except in compliance with the License.
8201+ * You may obtain a copy of the License at
8202+ *
8203+ * http://www.apache.org/licenses/LICENSE-2.0
8204+ *
8205+ * Unless required by applicable law or agreed to in writing, software
8206+ * distributed under the License is distributed on an "AS IS" BASIS,
8207+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8208+ * See the License for the specific language governing permissions and
8209+ * limitations under the License.
8210+ */
8211+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_
8212+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_
8213+
8214+#include "nnacl/intrinsics/ms_simd_instructions.h"
8215+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8216+
8217+#ifdef __cplusplus
8218+extern "C" {
8219+#endif
8220+
8221+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8222+#define BLOCK_NUM 4
8223+#define MS_SIMD_NEON
8224+
8225+static inline int BCEWithLogitLossNEON(int index, const float *logits, const float *label,
8226+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
8227+    float *reduction_sum) {
8228+    SIMD_F32 zero = SIMD_SET0_F32;
8229+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
8230+    SIMD_F32 middle_output = SIMD_SET0_F32;
8231+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8232+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
8233+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
8234+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
8235+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
8236+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
8237+      SIMD_F32 max_value = neg_logits_tmp;
8238+      max_value = SIMD_MIN_F32(max_value, zero);
8239+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
8240+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
8241+      SIMD_F32 log_exp_value =
8242+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
8243+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
8244+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
8245+      if (reduction) {
8246+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
8247+      } else {
8248+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
8249+      }
8250+    }
8251+    if (reduction) {
8252+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
8253+    }
8254+    return index;
8255+}
8256+#undef MS_SIMD_INSTRUCTION
8257+#undef BLOCK_NUM
8258+
8259+#undef MS_SIMD_NEON
8260+#ifdef __cplusplus
8261+}
8262+#endif
8263+#endif
8264diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
8265new file mode 100644
8266index 00000000..afaf0de5
8267--- /dev/null
8268+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
8269@@ -0,0 +1,63 @@
8270+/**
8271+ * Copyright 2022 Huawei Technologies Co., Ltd
8272+ *
8273+ * Licensed under the Apache License, Version 2.0 (the "License");
8274+ * you may not use this file except in compliance with the License.
8275+ * You may obtain a copy of the License at
8276+ *
8277+ * http://www.apache.org/licenses/LICENSE-2.0
8278+ *
8279+ * Unless required by applicable law or agreed to in writing, software
8280+ * distributed under the License is distributed on an "AS IS" BASIS,
8281+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8282+ * See the License for the specific language governing permissions and
8283+ * limitations under the License.
8284+ */
8285+
8286+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_
8287+#define MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_
8288+
8289+#include "nnacl/intrinsics/ms_simd_instructions.h"
8290+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8291+
8292+#ifdef __cplusplus
8293+extern "C" {
8294+#endif
8295+
8296+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8297+#define BLOCK_NUM 4
8298+#define MS_SIMD_NEON
8299+
8300+static inline int BiasAddByInnerCoreNEON(int index, const float *input, const float *bias, float *output,
8301+                                                       int64_t num) {
8302+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8303+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
8304+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
8305+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
8306+    SIMD_ST_F32(output + index, vout);
8307+  }
8308+  return index;
8309+}
8310+
8311+static inline int BiasAddByBatchCoreNEON(int index, const float *input, const float *bias, float *output1,
8312+                                                       float *output2, float *output3, float *output4, int64_t num) {
8313+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8314+    SIMD_LDX4_F32(input_data, input + index, num);
8315+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
8316+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
8317+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
8318+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
8319+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
8320+  }
8321+  return index;
8322+}
8323+
8324+#undef MS_SIMD_INSTRUCTION
8325+#undef BLOCK_NUM
8326+
8327+#undef MS_SIMD_NEON
8328+#ifdef __cplusplus
8329+}
8330+#endif
8331+
8332+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_
8333diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
8334new file mode 100644
8335index 00000000..8fe26687
8336--- /dev/null
8337+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
8338@@ -0,0 +1,55 @@
8339+/**
8340+ * Copyright 2022 Huawei Technologies Co., Ltd
8341+ *
8342+ * Licensed under the Apache License, Version 2.0 (the "License");
8343+ * you may not use this file except in compliance with the License.
8344+ * You may obtain a copy of the License at
8345+ *
8346+ * http://www.apache.org/licenses/LICENSE-2.0
8347+ *
8348+ * Unless required by applicable law or agreed to in writing, software
8349+ * distributed under the License is distributed on an "AS IS" BASIS,
8350+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8351+ * See the License for the specific language governing permissions and
8352+ * limitations under the License.
8353+ */
8354+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_
8355+#define MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_
8356+
8357+#include "nnacl/intrinsics/ms_simd_instructions.h"
8358+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8359+
8360+#ifdef __cplusplus
8361+extern "C" {
8362+#endif
8363+
8364+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8365+#define BLOCK_NUM 4
8366+#define MS_SIMD_NEON
8367+
8368+static inline int Int32ToFloat32NEON(int index, const int32_t *input, float *output, int number) {
8369+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8370+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
8371+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
8372+  }
8373+  return index;
8374+}
8375+
8376+#ifndef MS_SIMD_NEON
8377+static inline int Float32ToInt32NEON(int index, const float *input, int32_t *output, int number) {
8378+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8379+    SIMD_F32 value = SIMD_LD_F32(input + index);
8380+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
8381+  }
8382+  return index;
8383+}
8384+#endif
8385+
8386+#undef MS_SIMD_INSTRUCTION
8387+#undef BLOCK_NUM
8388+
8389+#undef MS_SIMD_NEON
8390+#ifdef __cplusplus
8391+}
8392+#endif
8393+#endif
8394diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
8395new file mode 100644
8396index 00000000..09f55bbf
8397--- /dev/null
8398+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
8399@@ -0,0 +1,69 @@
8400+/**
8401+ * Copyright 2022 Huawei Technologies Co., Ltd
8402+ *
8403+ * Licensed under the Apache License, Version 2.0 (the "License");
8404+ * you may not use this file except in compliance with the License.
8405+ * You may obtain a copy of the License at
8406+ *
8407+ * http://www.apache.org/licenses/LICENSE-2.0
8408+ *
8409+ * Unless required by applicable law or agreed to in writing, software
8410+ * distributed under the License is distributed on an "AS IS" BASIS,
8411+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8412+ * See the License for the specific language governing permissions and
8413+ * limitations under the License.
8414+ */
8415+#ifndef MINDSPORE_NNACL_FP32_CDIST_NEON_H_
8416+#define MINDSPORE_NNACL_FP32_CDIST_NEON_H_
8417+
8418+#include "nnacl/intrinsics/ms_simd_instructions.h"
8419+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8420+
8421+#ifdef __cplusplus
8422+extern "C" {
8423+#endif
8424+
8425+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8426+#define BLOCK_NUM 4
8427+#define MS_SIMD_NEON
8428+
8429+static inline int64_t CdistTwoNormalOptNEON(int64_t index, const float *a, const float *b,
8430+                                                          float *out, int64_t size) {
8431+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
8432+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8433+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
8434+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
8435+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
8436+    tmp_vec = SIMD_ABS_F32(tmp_vec);
8437+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
8438+  }
8439+  *out += SIMD_GET_SUM_F32(result_vec);
8440+
8441+  return index;
8442+}
8443+
8444+static inline int64_t CdistPNormalOptNEON(int64_t index, const float *a, const float *b,
8445+                                                        float *out, int64_t size, float p) {
8446+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
8447+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
8448+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8449+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
8450+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
8451+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
8452+    tmp_vec = SIMD_ABS_F32(tmp_vec);
8453+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
8454+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
8455+  }
8456+  *out += SIMD_GET_SUM_F32(result_vec);
8457+
8458+  return index;
8459+}
8460+
8461+#undef MS_SIMD_INSTRUCTION
8462+#undef BLOCK_NUM
8463+
8464+#undef MS_SIMD_NEON
8465+#ifdef __cplusplus
8466+}
8467+#endif
8468+#endif
8469diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
8470new file mode 100644
8471index 00000000..d8a2580a
8472--- /dev/null
8473+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
8474@@ -0,0 +1,120 @@
8475+/**
8476+ * Copyright 2022 Huawei Technologies Co., Ltd
8477+ *
8478+ * Licensed under the Apache License, Version 2.0 (the "License");
8479+ * you may not use this file except in compliance with the License.
8480+ * You may obtain a copy of the License at
8481+ *
8482+ * http://www.apache.org/licenses/LICENSE-2.0
8483+ *
8484+ * Unless required by applicable law or agreed to in writing, software
8485+ * distributed under the License is distributed on an "AS IS" BASIS,
8486+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8487+ * See the License for the specific language governing permissions and
8488+ * limitations under the License.
8489+ */
8490+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_
8491+#define MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_
8492+
8493+#include "nnacl/intrinsics/ms_simd_instructions.h"
8494+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8495+
8496+#ifdef __cplusplus
8497+extern "C" {
8498+#endif
8499+
8500+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8501+#define BLOCK_NUM 4
8502+#define MS_SIMD_NEON
8503+
8504+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
8505+// (a, b, c) -> (0, a,   a+b)    exclusive == true
8506+static inline int64_t CumsumOutputInitWithInputNEON(int64_t index, const float *layer_input,
8507+  float *layer_output, int inner_dim) {
8508+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8509+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
8510+  }
8511+  return index;
8512+}
8513+
8514+static inline int64_t CumsumOutputInitWithZeroNEON(int64_t index, float *layer_output, int inner_dim) {
8515+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8516+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
8517+  }
8518+  return index;
8519+}
8520+
8521+static inline int64_t CumsumNEON(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
8522+  int inner_dim) {
8523+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8524+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
8525+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
8526+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
8527+    SIMD_ST_F32(layer_output + index, out_val);
8528+  }
8529+  return index;
8530+}
8531+
8532+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
8533+// (a, b, c) -> (c+b, c, 0) exclusive==true
8534+static inline int64_t CumsumReverseNEON(int64_t index, const float *layer_input, float *layer_output,
8535+  float *layer_last_output, int inner_dim) {
8536+
8537+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8538+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
8539+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
8540+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
8541+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
8542+  }
8543+  return index;
8544+}
8545+
8546+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
8547+// (a, b, c) -> (0, a,   a+b)    exclusive == true
8548+static inline int64_t CumsumIntOutputInitWithInputNEON(int64_t index, const int *layer_input,
8549+  int *layer_output, int inner_dim) {
8550+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8551+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
8552+  }
8553+  return index;
8554+}
8555+
8556+static inline int64_t CumsumIntOutputInitWithZeroNEON(int64_t index, int *layer_output, int inner_dim) {
8557+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8558+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
8559+  }
8560+  return index;
8561+}
8562+
8563+static inline int64_t CumsumIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
8564+  int inner_dim) {
8565+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8566+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
8567+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
8568+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
8569+    SIMD_ST_EPI32(layer_output + index, out_val);
8570+  }
8571+  return index;
8572+}
8573+
8574+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
8575+// (a, b, c) -> (c+b, c, 0) exclusive==true
8576+static inline int64_t CumsumReverseIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
8577+  int inner_dim) {
8578+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8579+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
8580+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
8581+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
8582+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
8583+  }
8584+  return index;
8585+}
8586+
8587+#undef MS_SIMD_INSTRUCTION
8588+#undef BLOCK_NUM
8589+
8590+#undef MS_SIMD_NEON
8591+#ifdef __cplusplus
8592+}
8593+#endif
8594+#endif
8595diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
8596new file mode 100644
8597index 00000000..c4ce6594
8598--- /dev/null
8599+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
8600@@ -0,0 +1,166 @@
8601+/**
8602+ * Copyright 2022 Huawei Technologies Co., Ltd
8603+ *
8604+ * Licensed under the Apache License, Version 2.0 (the "License");
8605+ * you may not use this file except in compliance with the License.
8606+ * You may obtain a copy of the License at
8607+ *
8608+ * http://www.apache.org/licenses/LICENSE-2.0
8609+ *
8610+ * Unless required by applicable law or agreed to in writing, software
8611+ * distributed under the License is distributed on an "AS IS" BASIS,
8612+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8613+ * See the License for the specific language governing permissions and
8614+ * limitations under the License.
8615+ */
8616+
8617+#ifndef MINDSPORE_NNACL_FP32_DIV_NEON_H_
8618+#define MINDSPORE_NNACL_FP32_DIV_NEON_H_
8619+
8620+#include "nnacl/intrinsics/ms_simd_instructions.h"
8621+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8622+
8623+#ifdef __cplusplus
8624+extern "C" {
8625+#endif
8626+
8627+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8628+#define BLOCK_NUM 4
8629+#define MS_SIMD_NEON
8630+
8631+static inline int ElementOptDivNum0NEON(int index, const float *in0, const float *in1, float *out,
8632+                                                      int size) {
8633+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
8634+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8635+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8636+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
8637+    SIMD_ST_F32(out + index, vout);
8638+  }
8639+  return index;
8640+}
8641+
8642+static inline int ElementOptDivNum1NEON(int index, const float *in0, const float *in1, float *out,
8643+                                                      int size) {
8644+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
8645+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8646+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8647+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
8648+    SIMD_ST_F32(out + index, vout);
8649+  }
8650+  return index;
8651+}
8652+
8653+static inline int ElementOptDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
8654+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
8655+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8656+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
8657+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
8658+    SIMD_ST_EPI32(out + index, vout);
8659+  }
8660+  return index;
8661+}
8662+
8663+static inline int ElementOptDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
8664+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
8665+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8666+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
8667+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
8668+    SIMD_ST_EPI32(out + index, vout);
8669+  }
8670+  return index;
8671+}
8672+
8673+static inline int ElementOptDivReluNum0NEON(int index, const float *in0, const float *in1, float *out,
8674+                                                          int size) {
8675+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
8676+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8677+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8678+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
8679+    SIMD_ST_F32(out + index, vout);
8680+  }
8681+  return index;
8682+}
8683+
8684+static inline int ElementOptDivReluNum1NEON(int index, const float *in0, const float *in1, float *out,
8685+                                                          int size) {
8686+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
8687+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8688+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8689+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
8690+    SIMD_ST_F32(out + index, vout);
8691+  }
8692+  return index;
8693+}
8694+
8695+static inline int ElementOptDivRelu6Num0NEON(int index, const float *in0, const float *in1, float *out,
8696+                                                           int size) {
8697+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
8698+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8699+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8700+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
8701+    SIMD_ST_F32(out + index, vout);
8702+  }
8703+  return index;
8704+}
8705+
8706+static inline int ElementOptDivRelu6Num1NEON(int index, const float *in0, const float *in1, float *out,
8707+                                                           int size) {
8708+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
8709+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8710+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8711+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
8712+    SIMD_ST_F32(out + index, vout);
8713+  }
8714+  return index;
8715+}
8716+
8717+static inline int ElementDivNEON(int index, const float *in0, const float *in1, float *out, int size) {
8718+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8719+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8720+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8721+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
8722+    SIMD_ST_F32(out + index, vout);
8723+  }
8724+  return index;
8725+}
8726+
8727+static inline int ElementDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
8728+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8729+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
8730+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
8731+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
8732+    SIMD_ST_EPI32(out + index, vout);
8733+  }
8734+  return index;
8735+}
8736+
8737+static inline int ElementDivReluNEON(int index, const float *in0, const float *in1, float *out,
8738+                                                   int size) {
8739+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8740+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8741+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8742+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
8743+    SIMD_ST_F32(out + index, vout);
8744+  }
8745+  return index;
8746+}
8747+
8748+static inline int ElementDivRelu6NEON(int index, const float *in0, const float *in1, float *out,
8749+                                                    int size) {
8750+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8751+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8752+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8753+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
8754+    SIMD_ST_F32(out + index, vout);
8755+  }
8756+  return index;
8757+}
8758+
8759+#undef MS_SIMD_INSTRUCTION
8760+#undef BLOCK_NUM
8761+
8762+#undef MS_SIMD_NEON
8763+#ifdef __cplusplus
8764+}
8765+#endif
8766+#endif
8767diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
8768new file mode 100644
8769index 00000000..b71db336
8770--- /dev/null
8771+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
8772@@ -0,0 +1,45 @@
8773+/**
8774+ * Copyright 2022 Huawei Technologies Co., Ltd
8775+ *
8776+ * Licensed under the Apache License, Version 2.0 (the "License");
8777+ * you may not use this file except in compliance with the License.
8778+ * You may obtain a copy of the License at
8779+ *
8780+ * http://www.apache.org/licenses/LICENSE-2.0
8781+ *
8782+ * Unless required by applicable law or agreed to in writing, software
8783+ * distributed under the License is distributed on an "AS IS" BASIS,
8784+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8785+ * See the License for the specific language governing permissions and
8786+ * limitations under the License.
8787+ */
8788+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_
8789+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_
8790+
8791+#include "nnacl/intrinsics/ms_simd_instructions.h"
8792+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8793+
8794+#ifdef __cplusplus
8795+extern "C" {
8796+#endif
8797+
8798+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8799+#define BLOCK_NUM 4
8800+#define MS_SIMD_NEON
8801+
8802+static inline int DropoutFp32NEON(int index, const float *input, float scale,
8803+    int length, float *output) {
8804+    SIMD_F32 scale_value = SIMD_MOV_F32(scale);
8805+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8806+        SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
8807+    }
8808+    return index;
8809+}
8810+#undef MS_SIMD_INSTRUCTION
8811+#undef BLOCK_NUM
8812+
8813+#undef MS_SIMD_NEON
8814+#ifdef __cplusplus
8815+}
8816+#endif
8817+#endif
8818diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
8819new file mode 100644
8820index 00000000..a594abd2
8821--- /dev/null
8822+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
8823@@ -0,0 +1,62 @@
8824+/**
8825+ * Copyright 2022 Huawei Technologies Co., Ltd
8826+ *
8827+ * Licensed under the Apache License, Version 2.0 (the "License");
8828+ * you may not use this file except in compliance with the License.
8829+ * You may obtain a copy of the License at
8830+ *
8831+ * http://www.apache.org/licenses/LICENSE-2.0
8832+ *
8833+ * Unless required by applicable law or agreed to in writing, software
8834+ * distributed under the License is distributed on an "AS IS" BASIS,
8835+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8836+ * See the License for the specific language governing permissions and
8837+ * limitations under the License.
8838+ */
8839+
8840+#ifndef MINDSPORE_NNACL_FP32_EXP_FP32_NEON_H_
8841+#define MINDSPORE_NNACL_FP32_EXP_FP32_NEON_H_
8842+
8843+#include "nnacl/intrinsics/ms_simd_instructions.h"
8844+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8845+
8846+#ifdef __cplusplus
8847+extern "C" {
8848+#endif
8849+
8850+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8851+#define BLOCK_NUM 4
8852+#define MS_SIMD_NEON
8853+
8854+static inline int64_t ExpFp32NEON(int64_t index, const float *src, float *dst, int num) {
8855+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8856+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
8857+  }
8858+  return index;
8859+}
8860+
8861+static inline int64_t ExpFp32WithInScaleNEON(int64_t index, const float *src, float *dst, int num, float in_scale) {
8862+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
8863+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8864+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
8865+  }
8866+  return index;
8867+}
8868+
8869+static inline int64_t ExpFp32WithOutScaleNEON(int64_t index, const float *src, float *dst, int num, float out_scale) {
8870+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
8871+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8872+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
8873+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
8874+  }
8875+  return index;
8876+}
8877+
8878+#undef MS_SIMD_INSTRUCTION
8879+#undef BLOCK_NUM
8880+
8881+#undef MS_SIMD_NEON
8882+#ifdef __cplusplus
8883+}
8884+#endif
8885+#endif
8886diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
8887new file mode 100644
8888index 00000000..c467d2d9
8889--- /dev/null
8890+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
8891@@ -0,0 +1,52 @@
8892+/**
8893+ * Copyright 2022 Huawei Technologies Co., Ltd
8894+ *
8895+ * Licensed under the Apache License, Version 2.0 (the "License");
8896+ * you may not use this file except in compliance with the License.
8897+ * You may obtain a copy of the License at
8898+ *
8899+ * http://www.apache.org/licenses/LICENSE-2.0
8900+ *
8901+ * Unless required by applicable law or agreed to in writing, software
8902+ * distributed under the License is distributed on an "AS IS" BASIS,
8903+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8904+ * See the License for the specific language governing permissions and
8905+ * limitations under the License.
8906+ */
8907+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_
8908+#define MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_
8909+
8910+#include "nnacl/intrinsics/ms_simd_instructions.h"
8911+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8912+
8913+#ifdef __cplusplus
8914+extern "C" {
8915+#endif
8916+
8917+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8918+#define BLOCK_NUM 4
8919+#define MS_SIMD_NEON
8920+
8921+static inline int FillFp32NEON(int index, float *output, int size, float data) {
8922+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8923+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
8924+  }
8925+  return index;
8926+}
8927+
8928+static inline int FillInt32NEON(int index, int *output, int size, int data) {
8929+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8930+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
8931+  }
8932+  return index;
8933+}
8934+
8935+#undef MS_SIMD_INSTRUCTION
8936+#undef BLOCK_NUM
8937+
8938+#undef MS_SIMD_NEON
8939+#ifdef __cplusplus
8940+}
8941+#endif
8942+#endif
8943+
8944diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
8945new file mode 100644
8946index 00000000..0eb6c9d2
8947--- /dev/null
8948+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
8949@@ -0,0 +1,76 @@
8950+/**
8951+ * Copyright 2022 Huawei Technologies Co., Ltd
8952+ *
8953+ * Licensed under the Apache License, Version 2.0 (the "License");
8954+ * you may not use this file except in compliance with the License.
8955+ * You may obtain a copy of the License at
8956+ *
8957+ * http://www.apache.org/licenses/LICENSE-2.0
8958+ *
8959+ * Unless required by applicable law or agreed to in writing, software
8960+ * distributed under the License is distributed on an "AS IS" BASIS,
8961+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8962+ * See the License for the specific language governing permissions and
8963+ * limitations under the License.
8964+ */
8965+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_
8966+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_
8967+
8968+#include "nnacl/intrinsics/ms_simd_instructions.h"
8969+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8970+
8971+#ifdef __cplusplus
8972+extern "C" {
8973+#endif
8974+
8975+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8976+#define BLOCK_NUM 4
8977+#define MS_SIMD_NEON
8978+
8979+static inline int64_t GroupNormFp32NEON(int64_t index, const float *unit_input, float scale, float offset, float mean,
8980+  float var_sqrt, int unit, float *unit_output) {
8981+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
8982+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
8983+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
8984+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
8985+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8986+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
8987+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
8988+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
8989+    SIMD_ST_F32(unit_output + index, output);
8990+  }
8991+  return index;
8992+}
8993+
8994+static inline int64_t GroupNormReduceSumNEON(int64_t index, const float *in, float *sum, int unit) {
8995+  if (unit - index >= 4 * BLOCK_NUM) {
8996+    SIMD_F32 tmp = SIMD_MOV_F32(0);
8997+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8998+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
8999+    }
9000+    *sum += SIMD_GET_SUM_F32(tmp);
9001+  }
9002+  return index;
9003+}
9004+
9005+static inline int64_t GroupNormReduceVarNEON(int64_t index, const float *in, float mean, float *sum, int unit) {
9006+  if (unit - index >= 4 * BLOCK_NUM) {
9007+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
9008+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9009+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9010+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
9011+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
9012+    }
9013+    *sum += SIMD_GET_SUM_F32(tmp);
9014+  }
9015+  return index;
9016+}
9017+
9018+#undef MS_SIMD_INSTRUCTION
9019+#undef BLOCK_NUM
9020+
9021+#undef MS_SIMD_NEON
9022+#ifdef __cplusplus
9023+}
9024+#endif
9025+#endif
9026diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
9027new file mode 100644
9028index 00000000..0c528616
9029--- /dev/null
9030+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
9031@@ -0,0 +1,67 @@
9032+/**
9033+ * Copyright 2022 Huawei Technologies Co., Ltd
9034+ *
9035+ * Licensed under the Apache License, Version 2.0 (the "License");
9036+ * you may not use this file except in compliance with the License.
9037+ * You may obtain a copy of the License at
9038+ *
9039+ * http://www.apache.org/licenses/LICENSE-2.0
9040+ *
9041+ * Unless required by applicable law or agreed to in writing, software
9042+ * distributed under the License is distributed on an "AS IS" BASIS,
9043+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9044+ * See the License for the specific language governing permissions and
9045+ * limitations under the License.
9046+ */
9047+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_
9048+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_
9049+
9050+#include "nnacl/intrinsics/ms_simd_instructions.h"
9051+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9052+
9053+#ifdef __cplusplus
9054+extern "C" {
9055+#endif
9056+
9057+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9058+#define BLOCK_NUM 4
9059+#define MS_SIMD_NEON
9060+
9061+static inline int LayerNormMeanAndSquareNEON(int index, const float *src, int num, float *mean, float *square_mean) {
9062+  if (num >= 4 * BLOCK_NUM) {
9063+    SIMD_F32 sum_val = SIMD_SET0_F32;
9064+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
9065+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9066+      SIMD_F32 value = SIMD_LD_F32(src + index);
9067+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
9068+      sum_val = SIMD_ADD_F32(sum_val, value);
9069+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
9070+    }
9071+    *mean += SIMD_GET_SUM_F32(sum_val);
9072+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
9073+  }
9074+  return index;
9075+}
9076+
9077+static inline int LayerNormGammaAndBetaNEON(int index, float *dst, const float *src, const float *gamma_data,
9078+  const float *beta_data, int num, const float mean, const float deno) {
9079+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
9080+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
9081+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9082+    SIMD_F32 value = SIMD_LD_F32(src + index);
9083+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
9084+    out_value = SIMD_MUL_F32(out_value, deno_val);
9085+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
9086+    SIMD_ST_F32(dst + index, out_value);
9087+  }
9088+  return index;
9089+}
9090+
9091+#undef MS_SIMD_INSTRUCTION
9092+#undef BLOCK_NUM
9093+
9094+#undef MS_SIMD_NEON
9095+#ifdef __cplusplus
9096+}
9097+#endif
9098+#endif
9099diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
9100new file mode 100644
9101index 00000000..0e12e5a0
9102--- /dev/null
9103+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
9104@@ -0,0 +1,92 @@
9105+/**
9106+ * Copyright 2022 Huawei Technologies Co., Ltd
9107+ *
9108+ * Licensed under the Apache License, Version 2.0 (the "License");
9109+ * you may not use this file except in compliance with the License.
9110+ * You may obtain a copy of the License at
9111+ *
9112+ * http://www.apache.org/licenses/LICENSE-2.0
9113+ *
9114+ * Unless required by applicable law or agreed to in writing, software
9115+ * distributed under the License is distributed on an "AS IS" BASIS,
9116+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9117+ * See the License for the specific language governing permissions and
9118+ * limitations under the License.
9119+ */
9120+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_
9121+#define MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_
9122+
9123+#include "nnacl/intrinsics/ms_simd_instructions.h"
9124+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9125+
9126+#ifdef __cplusplus
9127+extern "C" {
9128+#endif
9129+
9130+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9131+#define BLOCK_NUM 4
9132+#define MS_SIMD_NEON
9133+
9134+// act_type must be 0, 1, 2. 0: no_act, 1: relu, 3: relu6.
9135+static inline int64_t GemmIsNotPackNEON(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
9136+  int deep, int act_type) {
9137+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
9138+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
9139+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
9140+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);
9141+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9142+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
9143+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
9144+    if (act_type != 0) {
9145+      dst = SIMD_MAX_F32(dst, down_threshold);
9146+      if (act_type == 3) {
9147+        dst = SIMD_MIN_F32(dst, up_threshold);
9148+      }
9149+    }
9150+    SIMD_ST_F32(c + index, dst);
9151+  }
9152+
9153+  return index;
9154+}
9155+
9156+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)
9157+static inline int64_t GemmIsNotPackOptimizeCoreNEON(int64_t index, const float *a, const float *b, int k, float *dst) {
9158+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
9159+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9160+    SIMD_F32 weight = SIMD_LD_F32(b + index);
9161+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
9162+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
9163+  }
9164+  *dst += SIMD_REDUCE_ADD_F32(dst1);
9165+  return index;
9166+}
9167+#endif
9168+
9169+static inline int64_t MatVecMulNoPackCoreNEON(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
9170+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
9171+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
9172+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
9173+    for (int64_t k = 0; k < depth; ++k) {
9174+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
9175+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
9176+      out = SIMD_FMADD_F32(left, right, out);
9177+    }
9178+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
9179+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
9180+      if (act_type == 0x3) {
9181+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
9182+      }
9183+    }
9184+    SIMD_ST_F32(c + oc_index, out);
9185+  }
9186+  return oc_index;
9187+}
9188+
9189+#undef MS_SIMD_INSTRUCTION
9190+#undef BLOCK_NUM
9191+
9192+#undef MS_SIMD_NEON
9193+#ifdef __cplusplus
9194+}
9195+#endif
9196+#endif
9197diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
9198new file mode 100644
9199index 00000000..33506e0c
9200--- /dev/null
9201+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
9202@@ -0,0 +1,217 @@
9203+/**
9204+ * Copyright 2022 Huawei Technologies Co., Ltd
9205+ *
9206+ * Licensed under the Apache License, Version 2.0 (the "License");
9207+ * you may not use this file except in compliance with the License.
9208+ * You may obtain a copy of the License at
9209+ *
9210+ * http://www.apache.org/licenses/LICENSE-2.0
9211+ *
9212+ * Unless required by applicable law or agreed to in writing, software
9213+ * distributed under the License is distributed on an "AS IS" BASIS,
9214+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9215+ * See the License for the specific language governing permissions and
9216+ * limitations under the License.
9217+ */
9218+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
9219+#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
9220+
9221+#include "nnacl/intrinsics/ms_simd_instructions.h"
9222+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9223+
9224+#ifdef __cplusplus
9225+extern "C" {
9226+#endif
9227+
9228+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9229+#define BLOCK_NUM 4
9230+#define MS_SIMD_NEON
9231+
9232+static inline int ElementMulNEON(int index, const float *in0, const float *in1, float *out, int size) {
9233+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9234+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9235+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9236+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
9237+    SIMD_ST_F32(out + index, vout);
9238+  }
9239+  return index;
9240+}
9241+
9242+static inline int ElementMulReluNEON(int index, const float *in0, const float *in1, float *out, int size) {
9243+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9244+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9245+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9246+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
9247+    SIMD_ST_F32(out + index, vout);
9248+  }
9249+  return index;
9250+}
9251+
9252+static inline int ElementMulRelu6NEON(int index, const float *in0, const float *in1, float *out, int size) {
9253+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9254+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9255+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9256+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
9257+    SIMD_ST_F32(out + index, vout);
9258+  }
9259+  return index;
9260+}
9261+
9262+static inline int ElementMulIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
9263+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9264+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9265+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9266+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
9267+    SIMD_ST_EPI32(out + index, vout);
9268+  }
9269+  return index;
9270+}
9271+
9272+static inline int ElementMulReluIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
9273+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9274+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9275+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9276+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
9277+    SIMD_ST_EPI32(out + index, vout);
9278+  }
9279+  return index;
9280+}
9281+
9282+static inline int ElementMulRelu6IntNEON(int index, const int *in0, const int *in1, int *out, int size) {
9283+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9284+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9285+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9286+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
9287+    SIMD_ST_EPI32(out + index, vout);
9288+  }
9289+  return index;
9290+}
9291+
9292+static inline int ElementOptMulNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
9293+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
9294+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9295+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9296+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
9297+    SIMD_ST_F32(out + index, vout);
9298+  }
9299+  return index;
9300+}
9301+
9302+static inline int ElementOptMulNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
9303+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9304+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9305+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9306+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
9307+    SIMD_ST_F32(out + index, vout);
9308+  }
9309+  return index;
9310+}
9311+
9312+static inline int ElementOptMulReluNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
9313+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
9314+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9315+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9316+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
9317+    SIMD_ST_F32(out + index, vout);
9318+  }
9319+  return index;
9320+}
9321+
9322+static inline int ElementOptMulReluNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
9323+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9324+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9325+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9326+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
9327+    SIMD_ST_F32(out + index, vout);
9328+  }
9329+  return index;
9330+}
9331+
9332+static inline int ElementOptMulRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, int size) {
9333+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
9334+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9335+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9336+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
9337+    SIMD_ST_F32(out + index, vout);
9338+  }
9339+  return index;
9340+}
9341+
9342+static inline int ElementOptMulRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, int size) {
9343+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9344+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9345+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9346+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
9347+    SIMD_ST_F32(out + index, vout);
9348+  }
9349+  return index;
9350+}
9351+
9352+static inline int ElementOptMulIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9353+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
9354+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9355+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9356+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
9357+    SIMD_ST_EPI32(out + index, vout);
9358+  }
9359+  return index;
9360+}
9361+
9362+static inline int ElementOptMulIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9363+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9364+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9365+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9366+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
9367+    SIMD_ST_EPI32(out + index, vout);
9368+  }
9369+  return index;
9370+}
9371+
9372+static inline int ElementOptMulReluIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9373+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
9374+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9375+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9376+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
9377+    SIMD_ST_EPI32(out + index, vout);
9378+  }
9379+  return index;
9380+}
9381+
9382+static inline int ElementOptMulReluIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9383+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9384+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9385+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9386+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
9387+    SIMD_ST_EPI32(out + index, vout);
9388+  }
9389+  return index;
9390+}
9391+
9392+static inline int ElementOptMulRelu6IntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9393+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
9394+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9395+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9396+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
9397+    SIMD_ST_EPI32(out + index, vout);
9398+  }
9399+  return index;
9400+}
9401+
9402+static inline int ElementOptMulRelu6IntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9403+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9404+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9405+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9406+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
9407+    SIMD_ST_EPI32(out + index, vout);
9408+  }
9409+  return index;
9410+}
9411+
9412+#undef MS_SIMD_INSTRUCTION
9413+#undef BLOCK_NUM
9414+
9415+#undef MS_SIMD_NEON
9416+#ifdef __cplusplus
9417+}
9418+#endif
9419+#endif
9420diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
9421new file mode 100644
9422index 00000000..ea6acf62
9423--- /dev/null
9424+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
9425@@ -0,0 +1,83 @@
9426+/**
9427+ * Copyright 2022 Huawei Technologies Co., Ltd
9428+ *
9429+ * Licensed under the Apache License, Version 2.0 (the "License");
9430+ * you may not use this file except in compliance with the License.
9431+ * You may obtain a copy of the License at
9432+ *
9433+ * http://www.apache.org/licenses/LICENSE-2.0
9434+ *
9435+ * Unless required by applicable law or agreed to in writing, software
9436+ * distributed under the License is distributed on an "AS IS" BASIS,
9437+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9438+ * See the License for the specific language governing permissions and
9439+ * limitations under the License.
9440+ */
9441+#ifndef MINDSPORE_NNACL_FP32_POOLING_NEON_H_
9442+#define MINDSPORE_NNACL_FP32_POOLING_NEON_H_
9443+
9444+#include "nnacl/intrinsics/ms_simd_instructions.h"
9445+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9446+
9447+#ifdef __cplusplus
9448+extern "C" {
9449+#endif
9450+
9451+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9452+#define BLOCK_NUM 4
9453+#define MS_SIMD_NEON
9454+
9455+static inline int AvgPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel,
9456+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
9457+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
9458+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
9459+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
9460+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
9461+    const float *src_c_ptr = src_plane_ptr + ci;
9462+    float *dst_c_ptr = dst_plane_ptr + ci;
9463+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
9464+    int real_count = 0;
9465+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
9466+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
9467+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
9468+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
9469+        ++real_count;
9470+      }
9471+    }
9472+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
9473+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
9474+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
9475+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
9476+  }
9477+  return ci;
9478+}
9479+
9480+static inline int MaxPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel,
9481+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
9482+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
9483+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
9484+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
9485+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
9486+    const float *src_c_ptr = src_plane_ptr + ci;
9487+    float *dst_c_ptr = dst_plane_ptr + ci;
9488+    SIMD_F32 tmp_max = min_val;
9489+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
9490+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
9491+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
9492+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
9493+      }
9494+    }
9495+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
9496+    SIMD_ST_F32(dst_c_ptr, tmp_max);
9497+  }
9498+  return ci;
9499+}
9500+
9501+#undef MS_SIMD_INSTRUCTION
9502+#undef BLOCK_NUM
9503+
9504+#undef MS_SIMD_NEON
9505+#ifdef __cplusplus
9506+}
9507+#endif
9508+#endif
9509diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
9510new file mode 100644
9511index 00000000..fd8699c7
9512--- /dev/null
9513+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
9514@@ -0,0 +1,100 @@
9515+/**
9516+ * Copyright 2022 Huawei Technologies Co., Ltd
9517+ *
9518+ * Licensed under the Apache License, Version 2.0 (the "License");
9519+ * you may not use this file except in compliance with the License.
9520+ * You may obtain a copy of the License at
9521+ *
9522+ * http://www.apache.org/licenses/LICENSE-2.0
9523+ *
9524+ * Unless required by applicable law or agreed to in writing, software
9525+ * distributed under the License is distributed on an "AS IS" BASIS,
9526+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9527+ * See the License for the specific language governing permissions and
9528+ * limitations under the License.
9529+ */
9530+#ifndef MINDSPORE_NNACL_FP32_POWER_NEON_H_
9531+#define MINDSPORE_NNACL_FP32_POWER_NEON_H_
9532+
9533+#include "nnacl/intrinsics/ms_simd_instructions.h"
9534+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9535+
9536+#ifdef __cplusplus
9537+extern "C" {
9538+#endif
9539+
9540+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9541+#define BLOCK_NUM 4
9542+#define MS_SIMD_NEON
9543+
9544+static inline int PowerBroadCastIntExponentNEON(int index, const float *input, int exponent, float *output, int len,
9545+  float scale, float shift) {
9546+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
9547+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
9548+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9549+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
9550+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
9551+    int exp = abs(exponent);
9552+    while (exp) {
9553+      if (exp % 2) {
9554+        result = SIMD_MUL_F32(result, tmp);
9555+      }
9556+      tmp = SIMD_MUL_SQUARE_F32(tmp);
9557+      exp = exp / 2;
9558+    }
9559+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
9560+  }
9561+  return index;
9562+}
9563+
9564+static inline int PowerBroadCastFloatExponentNEON(int index, const float *input, float exponent, float *output, int len,
9565+  float scale, float shift) {
9566+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
9567+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
9568+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9569+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
9570+    SIMD_F32 result;
9571+    for (int i = 0; i < BLOCK_NUM; ++i) {
9572+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
9573+    }
9574+    SIMD_ST_F32(output + index, result);
9575+  }
9576+  return index;
9577+}
9578+
9579+static inline int PowerSingleExponentNEON(int index, const float *input, const float *exponent, float *output, int len,
9580+  float scale, float shift) {
9581+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
9582+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
9583+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9584+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
9585+    for (int j = 0; j < BLOCK_NUM; ++j) {
9586+      float cur_exponent = exponent[index + j];
9587+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
9588+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
9589+        int exp = abs((int)(cur_exponent));
9590+        float result = 1;
9591+        while (exp) {
9592+          if (exp % 2) {
9593+            result *= cur_val;
9594+          }
9595+          cur_val *= cur_val;
9596+          exp = exp / 2;
9597+        }
9598+        output[index + j] = *exponent >= 0 ? result : 1 / result;
9599+      } else {
9600+        output[index + j] = powf(cur_val, cur_exponent);
9601+      }
9602+    }
9603+  }
9604+  return index;
9605+}
9606+
9607+#undef MS_SIMD_INSTRUCTION
9608+#undef BLOCK_NUM
9609+
9610+#undef MS_SIMD_NEON
9611+#ifdef __cplusplus
9612+}
9613+#endif
9614+#endif
9615diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
9616new file mode 100644
9617index 00000000..7f9153f8
9618--- /dev/null
9619+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
9620@@ -0,0 +1,180 @@
9621+/**
9622+ * Copyright 2022 Huawei Technologies Co., Ltd
9623+ *
9624+ * Licensed under the Apache License, Version 2.0 (the "License");
9625+ * you may not use this file except in compliance with the License.
9626+ * You may obtain a copy of the License at
9627+ *
9628+ * http://www.apache.org/licenses/LICENSE-2.0
9629+ *
9630+ * Unless required by applicable law or agreed to in writing, software
9631+ * distributed under the License is distributed on an "AS IS" BASIS,
9632+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9633+ * See the License for the specific language governing permissions and
9634+ * limitations under the License.
9635+ */
9636+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_
9637+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_
9638+
9639+#include "nnacl/intrinsics/ms_simd_instructions.h"
9640+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9641+
9642+#ifdef __cplusplus
9643+extern "C" {
9644+#endif
9645+
9646+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9647+#define BLOCK_NUM 4
9648+#define MS_SIMD_NEON
9649+
9650+static inline int64_t ReduceSumNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9651+  int axis_size) {
9652+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9653+    const float *inner_src = outer_src + index;
9654+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9655+    for (int i = 0; i < axis_size; i++) {
9656+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9657+    }
9658+    SIMD_ST_F32(outer_dst + index, tmp);
9659+  }
9660+  return index;
9661+}
9662+
9663+static inline int64_t ReduceMeanNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9664+  int axis_size) {
9665+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9666+    const float *inner_src = outer_src + index;
9667+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9668+    for (int i = 0; i < axis_size; i++) {
9669+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9670+    }
9671+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
9672+  }
9673+  return index;
9674+}
9675+
9676+static inline int64_t ReduceMinNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9677+  int axis_size) {
9678+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9679+    const float *inner_src = outer_src + index;
9680+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
9681+    for (int i = 0; i < axis_size; i++) {
9682+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9683+    }
9684+    SIMD_ST_F32(outer_dst + index, tmp);
9685+  }
9686+  return index;
9687+}
9688+
9689+static inline int64_t ReduceMaxNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9690+  int axis_size) {
9691+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9692+    const float *inner_src = outer_src + index;
9693+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MIN);
9694+    for (int i = 0; i < axis_size; i++) {
9695+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9696+    }
9697+    SIMD_ST_F32(outer_dst + index, tmp);
9698+  }
9699+  return index;
9700+}
9701+
9702+static inline int64_t ReduceProdNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9703+  int axis_size) {
9704+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9705+    const float *inner_src = outer_src + index;
9706+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
9707+    for (int i = 0; i < axis_size; i++) {
9708+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9709+    }
9710+    SIMD_ST_F32(outer_dst + index, tmp);
9711+  }
9712+  return index;
9713+}
9714+
9715+static inline int64_t ReduceSumSquareNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9716+  int axis_size) {
9717+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9718+    const float *inner_src = outer_src + index;
9719+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9720+    for (int i = 0; i < axis_size; i++) {
9721+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
9722+    }
9723+    SIMD_ST_F32(outer_dst + index, tmp);
9724+  }
9725+  return index;
9726+}
9727+
9728+static inline int64_t ReduceL2NormNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9729+  int axis_size) {
9730+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9731+    const float *inner_src = outer_src + index;
9732+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9733+    for (int i = 0; i < axis_size; i++) {
9734+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
9735+    }
9736+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));
9737+  }
9738+  return index;
9739+}
9740+
9741+static inline int64_t IntReduceSumNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9742+  int axis_size) {
9743+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9744+    const int *inner_src = outer_src + index;
9745+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
9746+    for (int i = 0; i < axis_size; i++) {
9747+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9748+    }
9749+    SIMD_ST_EPI32(outer_dst + index, tmp);
9750+  }
9751+  return index;
9752+}
9753+
9754+static inline int64_t IntReduceMeanNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9755+  int axis_size) {
9756+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9757+    const int *inner_src = outer_src + index;
9758+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
9759+    for (int i = 0; i < axis_size; i++) {
9760+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9761+    }
9762+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));
9763+  }
9764+  return index;
9765+}
9766+
9767+static inline int64_t IntReduceMinNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9768+  int axis_size) {
9769+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9770+    const int *inner_src = outer_src + index;
9771+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);
9772+    for (int i = 0; i < axis_size; i++) {
9773+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9774+    }
9775+    SIMD_ST_EPI32(outer_dst + index, tmp);
9776+  }
9777+  return index;
9778+}
9779+
9780+static inline int64_t IntReduceMaxNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9781+  int axis_size) {
9782+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9783+    const int *inner_src = outer_src + index;
9784+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);
9785+    for (int i = 0; i < axis_size; i++) {
9786+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9787+    }
9788+    SIMD_ST_EPI32(outer_dst + index, tmp);
9789+  }
9790+  return index;
9791+}
9792+
9793+#undef MS_SIMD_INSTRUCTION
9794+#undef BLOCK_NUM
9795+
9796+#undef MS_SIMD_NEON
9797+#ifdef __cplusplus
9798+}
9799+#endif
9800+#endif
9801diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
9802new file mode 100644
9803index 00000000..f116d92f
9804--- /dev/null
9805+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
9806@@ -0,0 +1,86 @@
9807+/**
9808+ * Copyright 2022 Huawei Technologies Co., Ltd
9809+ *
9810+ * Licensed under the Apache License, Version 2.0 (the "License");
9811+ * you may not use this file except in compliance with the License.
9812+ * You may obtain a copy of the License at
9813+ *
9814+ * http://www.apache.org/licenses/LICENSE-2.0
9815+ *
9816+ * Unless required by applicable law or agreed to in writing, software
9817+ * distributed under the License is distributed on an "AS IS" BASIS,
9818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9819+ * See the License for the specific language governing permissions and
9820+ * limitations under the License.
9821+ */
9822+
9823+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_
9824+#define MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_
9825+
9826+#include "nnacl/intrinsics/ms_simd_instructions.h"
9827+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9828+
9829+#ifdef __cplusplus
9830+extern "C" {
9831+#endif
9832+
9833+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9834+#define BLOCK_NUM 4
9835+#define MS_SIMD_NEON
9836+
9837+static inline int64_t SoftmaxNormGetMaxNEON(int64_t index, const float *src, int cur_batch_offset,
9838+  float *max, int channel) {
9839+  if (channel >= BLOCK_NUM * BLOCK_NUM) {
9840+    SIMD_F32 max_val = SIMD_MOV_F32(*max);
9841+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9842+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
9843+    }
9844+    *max = SIMD_GET_MAX_F32(max_val);
9845+  }
9846+  return index;
9847+}
9848+
9849+static inline int64_t SoftmaxNormCalcNormNEON(int64_t index, const float *src, float *dst,
9850+  int cur_batch_offset, float max, int channel) {
9851+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9852+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
9853+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
9854+  }
9855+  return index;
9856+}
9857+
9858+static inline int64_t SoftmaxLastAxisGetExpSumNEON(int64_t index, const float *src, float *dst,
9859+  int cur_batch_offset, float max, float *exp_sum, int channel) {
9860+#ifndef _WIN32
9861+  SIMD_F32 sum_val = SIMD_SET0_F32;
9862+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9863+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
9864+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
9865+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
9866+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
9867+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
9868+  }
9869+  *exp_sum += SIMD_GET_SUM_F32(sum_val);
9870+#endif
9871+  return index;
9872+}
9873+
9874+static inline int64_t SoftmaxLastAxisGetResultNEON(int64_t index, const float *src, float *dst,
9875+  int cur_batch_offset, float exp_sum, int channel) {
9876+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
9877+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9878+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
9879+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
9880+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
9881+  }
9882+  return index;
9883+}
9884+
9885+#undef MS_SIMD_INSTRUCTION
9886+#undef BLOCK_NUM
9887+
9888+#undef MS_SIMD_NEON
9889+#ifdef __cplusplus
9890+};
9891+#endif
9892+#endif
9893diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
9894new file mode 100644
9895index 00000000..d2731101
9896--- /dev/null
9897+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
9898@@ -0,0 +1,166 @@
9899+/**
9900+ * Copyright 2022 Huawei Technologies Co., Ltd
9901+ *
9902+ * Licensed under the Apache License, Version 2.0 (the "License");
9903+ * you may not use this file except in compliance with the License.
9904+ * You may obtain a copy of the License at
9905+ *
9906+ * http://www.apache.org/licenses/LICENSE-2.0
9907+ *
9908+ * Unless required by applicable law or agreed to in writing, software
9909+ * distributed under the License is distributed on an "AS IS" BASIS,
9910+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9911+ * See the License for the specific language governing permissions and
9912+ * limitations under the License.
9913+ */
9914+
9915+#ifndef MINDSPORE_NNACL_FP32_SUB_NEON_H_
9916+#define MINDSPORE_NNACL_FP32_SUB_NEON_H_
9917+
9918+#include "nnacl/intrinsics/ms_simd_instructions.h"
9919+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9920+
9921+#ifdef __cplusplus
9922+extern "C" {
9923+#endif
9924+
9925+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9926+#define BLOCK_NUM 4
9927+#define MS_SIMD_NEON
9928+
9929+static inline int ElementOptSubNum0NEON(int index, const float *in0, const float *in1, float *out,
9930+                                                      int size) {
9931+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
9932+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9933+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9934+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
9935+    SIMD_ST_F32(out + index, vout);
9936+  }
9937+  return index;
9938+}
9939+
9940+static inline int ElementOptSubNum1NEON(int index, const float *in0, const float *in1, float *out,
9941+                                                      int size) {
9942+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9943+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9944+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9945+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
9946+    SIMD_ST_F32(out + index, vout);
9947+  }
9948+  return index;
9949+}
9950+
9951+static inline int ElementOptSubIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9952+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
9953+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9954+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9955+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
9956+    SIMD_ST_EPI32(out + index, vout);
9957+  }
9958+  return index;
9959+}
9960+
9961+static inline int ElementOptSubIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9962+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9963+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9964+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9965+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
9966+    SIMD_ST_EPI32(out + index, vout);
9967+  }
9968+  return index;
9969+}
9970+
9971+static inline int ElementOptSubReluNum0NEON(int index, const float *in0, const float *in1, float *out,
9972+                                                          int size) {
9973+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
9974+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9975+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9976+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
9977+    SIMD_ST_F32(out + index, vout);
9978+  }
9979+  return index;
9980+}
9981+
9982+static inline int ElementOptSubReluNum1NEON(int index, const float *in0, const float *in1, float *out,
9983+                                                          int size) {
9984+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9985+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9986+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9987+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
9988+    SIMD_ST_F32(out + index, vout);
9989+  }
9990+  return index;
9991+}
9992+
9993+static inline int ElementOptSubRelu6Num0NEON(int index, const float *in0, const float *in1, float *out,
9994+                                                           int size) {
9995+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
9996+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9997+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9998+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
9999+    SIMD_ST_F32(out + index, vout);
10000+  }
10001+  return index;
10002+}
10003+
10004+static inline int ElementOptSubRelu6Num1NEON(int index, const float *in0, const float *in1, float *out,
10005+                                                           int size) {
10006+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
10007+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10008+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10009+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
10010+    SIMD_ST_F32(out + index, vout);
10011+  }
10012+  return index;
10013+}
10014+
10015+static inline int ElementSubNEON(int index, const float *in0, const float *in1, float *out, int size) {
10016+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10017+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10018+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10019+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
10020+    SIMD_ST_F32(out + index, vout);
10021+  }
10022+  return index;
10023+}
10024+
10025+static inline int ElementSubIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
10026+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10027+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
10028+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
10029+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
10030+    SIMD_ST_EPI32(out + index, vout);
10031+  }
10032+  return index;
10033+}
10034+
10035+static inline int ElementSubReluNEON(int index, const float *in0, const float *in1, float *out,
10036+                                                   int size) {
10037+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10038+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10039+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10040+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
10041+    SIMD_ST_F32(out + index, vout);
10042+  }
10043+  return index;
10044+}
10045+
10046+static inline int ElementSubRelu6NEON(int index, const float *in0, const float *in1, float *out,
10047+                                                    int size) {
10048+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10049+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10050+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10051+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
10052+    SIMD_ST_F32(out + index, vout);
10053+  }
10054+  return index;
10055+}
10056+
10057+#undef MS_SIMD_INSTRUCTION
10058+#undef BLOCK_NUM
10059+
10060+#undef MS_SIMD_NEON
10061+#ifdef __cplusplus
10062+};
10063+#endif
10064+#endif
10065diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
10066new file mode 100644
10067index 00000000..75bda800
10068--- /dev/null
10069+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
10070@@ -0,0 +1,36 @@
10071+/**
10072+ * Copyright 2022 Huawei Technologies Co., Ltd
10073+ *
10074+ * Licensed under the Apache License, Version 2.0 (the "License");
10075+ * you may not use this file except in compliance with the License.
10076+ * You may obtain a copy of the License at
10077+ *
10078+ * http://www.apache.org/licenses/LICENSE-2.0
10079+ *
10080+ * Unless required by applicable law or agreed to in writing, software
10081+ * distributed under the License is distributed on an "AS IS" BASIS,
10082+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10083+ * See the License for the specific language governing permissions and
10084+ * limitations under the License.
10085+ */
10086+#ifndef MINDSPORE_NNACL_POOLING_FP32_SIMD_H_
10087+#define MINDSPORE_NNACL_POOLING_FP32_SIMD_H_
10088+
10089+#include "nnacl/intrinsics/ms_simd_instructions.h"
10090+#ifdef ENABLE_AVX512
10091+#include "nnacl/avx512/pooling_fp32_avx512.h"
10092+#endif
10093+
10094+#ifdef ENABLE_AVX
10095+#include "nnacl/avx/pooling_fp32_avx.h"
10096+#endif
10097+
10098+#ifdef ENABLE_SSE
10099+#include "nnacl/sse/pooling_fp32_sse.h"
10100+#endif
10101+
10102+#ifdef ENABLE_ARM
10103+#include "nnacl/neon/pooling_fp32_neon.h"
10104+#endif
10105+
10106+#endif
10107diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
10108new file mode 100644
10109index 00000000..15e9f009
10110--- /dev/null
10111+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
10112@@ -0,0 +1,36 @@
10113+/**
10114+ * Copyright 2022 Huawei Technologies Co., Ltd
10115+ *
10116+ * Licensed under the Apache License, Version 2.0 (the "License");
10117+ * you may not use this file except in compliance with the License.
10118+ * You may obtain a copy of the License at
10119+ *
10120+ * http://www.apache.org/licenses/LICENSE-2.0
10121+ *
10122+ * Unless required by applicable law or agreed to in writing, software
10123+ * distributed under the License is distributed on an "AS IS" BASIS,
10124+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10125+ * See the License for the specific language governing permissions and
10126+ * limitations under the License.
10127+ */
10128+#ifndef MINDSPORE_NNACL_POWER_FP32_SIMD_H_
10129+#define MINDSPORE_NNACL_POWER_FP32_SIMD_H_
10130+
10131+#include "nnacl/intrinsics/ms_simd_instructions.h"
10132+#ifdef ENABLE_AVX512
10133+#include "nnacl/avx512/power_fp32_avx512.h"
10134+#endif
10135+
10136+#ifdef ENABLE_AVX
10137+#include "nnacl/avx/power_fp32_avx.h"
10138+#endif
10139+
10140+#ifdef ENABLE_SSE
10141+#include "nnacl/sse/power_fp32_sse.h"
10142+#endif
10143+
10144+#ifdef ENABLE_ARM
10145+#include "nnacl/neon/power_fp32_neon.h"
10146+#endif
10147+
10148+#endif
10149diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
10150new file mode 100644
10151index 00000000..60d0cd85
10152--- /dev/null
10153+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
10154@@ -0,0 +1,36 @@
10155+/**
10156+ * Copyright 2022 Huawei Technologies Co., Ltd
10157+ *
10158+ * Licensed under the Apache License, Version 2.0 (the "License");
10159+ * you may not use this file except in compliance with the License.
10160+ * You may obtain a copy of the License at
10161+ *
10162+ * http://www.apache.org/licenses/LICENSE-2.0
10163+ *
10164+ * Unless required by applicable law or agreed to in writing, software
10165+ * distributed under the License is distributed on an "AS IS" BASIS,
10166+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10167+ * See the License for the specific language governing permissions and
10168+ * limitations under the License.
10169+ */
10170+#ifndef MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_
10171+#define MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_
10172+
10173+#include "nnacl/intrinsics/ms_simd_instructions.h"
10174+#ifdef ENABLE_AVX512
10175+#include "nnacl/avx512/reduce_fp32_avx512.h"
10176+#endif
10177+
10178+#ifdef ENABLE_AVX
10179+#include "nnacl/avx/reduce_fp32_avx.h"
10180+#endif
10181+
10182+#ifdef ENABLE_SSE
10183+#include "nnacl/sse/reduce_fp32_sse.h"
10184+#endif
10185+
10186+#ifdef ENABLE_ARM
10187+#include "nnacl/neon/reduce_fp32_neon.h"
10188+#endif
10189+
10190+#endif
10191diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
10192new file mode 100644
10193index 00000000..524668ab
10194--- /dev/null
10195+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
10196@@ -0,0 +1,36 @@
10197+/**
10198+ * Copyright 2022 Huawei Technologies Co., Ltd
10199+ *
10200+ * Licensed under the Apache License, Version 2.0 (the "License");
10201+ * you may not use this file except in compliance with the License.
10202+ * You may obtain a copy of the License at
10203+ *
10204+ * http://www.apache.org/licenses/LICENSE-2.0
10205+ *
10206+ * Unless required by applicable law or agreed to in writing, software
10207+ * distributed under the License is distributed on an "AS IS" BASIS,
10208+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10209+ * See the License for the specific language governing permissions and
10210+ * limitations under the License.
10211+ */
10212+#ifndef MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_
10213+#define MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_
10214+
10215+#include "nnacl/intrinsics/ms_simd_instructions.h"
10216+#ifdef ENABLE_AVX512
10217+#include "nnacl/avx512/softmax_fp32_avx512.h"
10218+#endif
10219+
10220+#ifdef ENABLE_AVX
10221+#include "nnacl/avx/softmax_fp32_avx.h"
10222+#endif
10223+
10224+#ifdef ENABLE_SSE
10225+#include "nnacl/sse/softmax_fp32_sse.h"
10226+#endif
10227+
10228+#ifdef ENABLE_ARM
10229+#include "nnacl/neon/softmax_fp32_neon.h"
10230+#endif
10231+
10232+#endif
10233diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
10234new file mode 100644
10235index 00000000..192fc66d
10236--- /dev/null
10237+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
10238@@ -0,0 +1,221 @@
10239+/**
10240+ * Copyright 2022 Huawei Technologies Co., Ltd
10241+ *
10242+ * Licensed under the Apache License, Version 2.0 (the "License");
10243+ * you may not use this file except in compliance with the License.
10244+ * You may obtain a copy of the License at
10245+ *
10246+ * http://www.apache.org/licenses/LICENSE-2.0
10247+ *
10248+ * Unless required by applicable law or agreed to in writing, software
10249+ * distributed under the License is distributed on an "AS IS" BASIS,
10250+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10251+ * See the License for the specific language governing permissions and
10252+ * limitations under the License.
10253+ */
10254+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_
10255+#define MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_
10256+
10257+#include "nnacl/intrinsics/ms_simd_instructions.h"
10258+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10259+
10260+#ifdef __cplusplus
10261+extern "C" {
10262+#endif
10263+#pragma GCC push_options
10264+#pragma GCC target("sse4.1")
10265+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10266+#define BLOCK_NUM 4
10267+#define MS_SIMD_SSE
10268+
10269+static inline int Fp32ReluSSE(int index, const float *src, int length, float *dst) {
10270+    SIMD_F32 zero = SIMD_SET0_F32;
10271+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10272+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
10273+    }
10274+    return index;
10275+}
10276+
10277+static inline int Int32ReluSSE(int index, const int32_t *src, int length, int32_t *dst) {
10278+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f);
10279+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10280+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
10281+    }
10282+    return index;
10283+}
10284+
10285+static inline int Fp32Relu6SSE(int index, const float *src, int length, float *dst) {
10286+    SIMD_F32 zero = SIMD_SET0_F32;
10287+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
10288+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10289+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
10290+    }
10291+    return index;
10292+}
10293+
10294+static inline int LReluSSE(int index, const float *src, int length, float *dst, float alpha) {
10295+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
10296+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10297+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10298+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
10299+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
10300+    }
10301+    return index;
10302+}
10303+
10304+static inline int SigmoidSSE(int index, const float *src, int length, float *dst) {
10305+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10306+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
10307+        SIMD_ST_F32(dst + index,
10308+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
10309+    }
10310+    return index;
10311+}
10312+
10313+static inline int TanhSSE(int index, const float *src, int length, float *dst) {
10314+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10315+        SIMD_F32 input = SIMD_LD_F32(src + index);
10316+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
10317+    }
10318+    return index;
10319+}
10320+
10321+static inline int SwishSSE(int index, const float *src, int length, float *dst) {
10322+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10323+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
10324+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
10325+        SIMD_ST_F32(dst + index,
10326+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
10327+    }
10328+    return index;
10329+}
10330+
10331+static inline int HSwishSSE(int index, const float *src, int length, float *dst) {
10332+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10333+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
10334+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
10335+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
10336+    }
10337+    return index;
10338+}
10339+
10340+static inline int HSigmoidSSE(int index, const float *src, int length, float *dst) {
10341+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10342+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
10343+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
10344+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
10345+    }
10346+    return index;
10347+}
10348+
10349+static inline int HardTanhNoLimitMinSSE(int index, const float *src, int length, float *dst, float min_val,
10350+                                            float max_val) {
10351+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10352+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
10353+    }
10354+    return index;
10355+}
10356+
10357+static inline int HardTanhNoLimitMaxSSE(int index, const float *src, int length, float *dst, float min_val,
10358+                                            float max_val) {
10359+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10360+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
10361+    }
10362+    return index;
10363+}
10364+
10365+static inline int HardTanhLimitMinMaxSSE(int index, const float *src, int length, float *dst, float min_val,
10366+                                             float max_val) {
10367+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10368+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
10369+    }
10370+    return index;
10371+}
10372+
10373+static inline int GeluApproximateSSE(int index, const float *src, int length, float *dst) {
10374+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10375+        SIMD_F32 in = SIMD_LD_F32(src + index);
10376+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
10377+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
10378+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
10379+    }
10380+    return index;
10381+}
10382+
10383+static inline int GeluSSE(int index, const float *src, int length, float *dst) {
10384+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
10385+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
10386+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
10387+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10388+      SIMD_F32 in = SIMD_LD_F32(src + index);
10389+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
10390+      SIMD_ST_F32(dst + index, res);
10391+    }
10392+    return index;
10393+}
10394+
10395+static inline int EluSSE(int index, const float *src, int length, float *dst, float alpha) {
10396+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10397+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10398+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
10399+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
10400+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
10401+    }
10402+    return index;
10403+}
10404+
10405+static inline int CeluSSE(int index, const float *src, int length, float *dst, float alpha) {
10406+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10407+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10408+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
10409+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
10410+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
10411+    }
10412+    return index;
10413+}
10414+
10415+static inline int HShrinkSSE(int index, const float *src, int length, float *dst, float lambd) {
10416+    const float neg_lambd = -1 * lambd;
10417+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10418+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10419+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
10420+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
10421+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
10422+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
10423+    }
10424+    return index;
10425+}
10426+
10427+static inline int SoftShrinkSSE(int index, const float *src, int length, float *dst, float lambd) {
10428+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
10429+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
10430+
10431+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10432+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
10433+        /* v0 = (in > lamdb) & (in - lamdb) */
10434+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
10435+        /* v1 = (in < -lamdb) & (in + lamdb) */
10436+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
10437+        /* out = (v0 | v1) */
10438+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
10439+    }
10440+    return index;
10441+}
10442+
10443+static inline int SoftsignFp32OptSSE(int index, const float *src, int length, float *dst) {
10444+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10445+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10446+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
10447+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
10448+    }
10449+    return index;
10450+}
10451+
10452+#undef MS_SIMD_INSTRUCTION
10453+#undef BLOCK_NUM
10454+#pragma GCC pop_options
10455+#undef MS_SIMD_SSE
10456+#ifdef __cplusplus
10457+}
10458+#endif
10459+#endif
10460diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
10461new file mode 100644
10462index 00000000..85996f69
10463--- /dev/null
10464+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
10465@@ -0,0 +1,57 @@
10466+/**
10467+ * Copyright 2022 Huawei Technologies Co., Ltd
10468+ *
10469+ * Licensed under the Apache License, Version 2.0 (the "License");
10470+ * you may not use this file except in compliance with the License.
10471+ * You may obtain a copy of the License at
10472+ *
10473+ * http://www.apache.org/licenses/LICENSE-2.0
10474+ *
10475+ * Unless required by applicable law or agreed to in writing, software
10476+ * distributed under the License is distributed on an "AS IS" BASIS,
10477+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10478+ * See the License for the specific language governing permissions and
10479+ * limitations under the License.
10480+ */
10481+
10482+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_
10483+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_
10484+
10485+#include "nnacl/intrinsics/ms_simd_instructions.h"
10486+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10487+
10488+#ifdef __cplusplus
10489+extern "C" {
10490+#endif
10491+#pragma GCC push_options
10492+#pragma GCC target("sse4.1")
10493+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10494+#define BLOCK_NUM 4
10495+#define MS_SIMD_SSE
10496+
10497+static inline int ShrinkGradSSE(int index, const float *src0, const float *src1,
10498+                                               int length, float *dst, float lambd) {
10499+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
10500+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
10501+
10502+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10503+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
10504+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
10505+
10506+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
10507+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
10508+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
10509+
10510+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
10511+    }
10512+    return index;
10513+}
10514+
10515+#undef MS_SIMD_INSTRUCTION
10516+#undef BLOCK_NUM
10517+#pragma GCC pop_options
10518+#undef MS_SIMD_SSE
10519+#ifdef __cplusplus
10520+}
10521+#endif
10522+#endif
10523diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
10524new file mode 100644
10525index 00000000..1f5291a4
10526--- /dev/null
10527+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
10528@@ -0,0 +1,210 @@
10529+/**
10530+ * Copyright 2022 Huawei Technologies Co., Ltd
10531+ *
10532+ * Licensed under the Apache License, Version 2.0 (the "License");
10533+ * you may not use this file except in compliance with the License.
10534+ * You may obtain a copy of the License at
10535+ *
10536+ * http://www.apache.org/licenses/LICENSE-2.0
10537+ *
10538+ * Unless required by applicable law or agreed to in writing, software
10539+ * distributed under the License is distributed on an "AS IS" BASIS,
10540+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10541+ * See the License for the specific language governing permissions and
10542+ * limitations under the License.
10543+ */
10544+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_
10545+#define MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_
10546+
10547+#include "nnacl/intrinsics/ms_simd_instructions.h"
10548+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10549+
10550+#ifdef __cplusplus
10551+extern "C" {
10552+#endif
10553+#pragma GCC push_options
10554+#pragma GCC target("sse4.1")
10555+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10556+#define BLOCK_NUM 4
10557+#define MS_SIMD_SSE
10558+#ifdef MS_SIMD_AVX512
10559+  static inline size_t AdamWeightDecayFp32SSE(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10560+    const float *gradient, size_t end) {
10561+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10562+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10563+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10564+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10565+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10566+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10567+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10568+
10569+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10570+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
10571+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10572+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10573+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
10574+
10575+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10576+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10577+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10578+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
10579+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
10580+    avx_r0 = SIMD_SQRT_F32(v_r);
10581+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
10582+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
10583+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
10584+    SIMD_ST_F32(m + index, m_r);
10585+    SIMD_ST_F32(v + index, v_r);
10586+    SIMD_ST_F32(var + index, var_r);
10587+  }
10588+
10589+  return index;
10590+}
10591+
10592+static inline size_t FusedCastAdamFp32Fp16SSE(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10593+    float global_norm_reciprocal, size_t end) {
10594+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10595+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10596+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10597+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10598+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10599+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10600+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10601+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10602+
10603+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10604+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
10605+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10606+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10607+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
10608+
10609+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
10610+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10611+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10612+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10613+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
10614+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
10615+    avx_r0 = SIMD_SQRT_F32(v_r);
10616+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
10617+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
10618+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
10619+    SIMD_ST_F32(var + index, var_r);
10620+    SIMD_ST_F32(m + index, m_r);
10621+    SIMD_ST_F32(v + index, v_r);
10622+  }
10623+
10624+  return index;
10625+}
10626+
10627+static inline size_t FusedCastAdamFp32Fp32SSE(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10628+    float global_norm_reciprocal, size_t end) {  // AdamWeightDecay step, fp32 var / fp32 gradient; returns first index not processed by SIMD
10629+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10630+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10631+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10632+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10633+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10634+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10635+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10636+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10637+
10638+  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // fix: (end - BLOCK_NUM + 1) underflows size_t when end < BLOCK_NUM
10639+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
10640+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10641+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10642+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
10643+
10644+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);  // gradient clipping by global norm
10645+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10646+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10647+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10648+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);  // m = beta1*m + (1-beta1)*g
10649+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);  // v = beta2*v + (1-beta2)*g^2
10650+    avx_r0 = SIMD_SQRT_F32(v_r);
10651+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));  // m / (sqrt(v) + eps)
10652+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);  // + decay * var (weight decay)
10653+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);  // var -= lr * update
10654+    SIMD_ST_F32(var + index, var_r);
10655+    SIMD_ST_F32(m + index, m_r);
10656+    SIMD_ST_F32(v + index, v_r);
10657+  }
10658+
10659+  return index;
10660+}
10661+
10662+static inline size_t FusedCastAdamFp16Fp16SSE(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10663+    float global_norm_reciprocal, size_t end) {  // AdamWeightDecay step, fp16 var / fp16 gradient; returns first index not processed by SIMD
10664+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10665+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10666+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10667+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10668+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10669+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10670+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10671+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10672+
10673+  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // fix: (end - BLOCK_NUM + 1) underflows size_t when end < BLOCK_NUM
10674+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));  // fix: was var16 (no + index) — reloaded lanes 0..3 every iteration; store below uses var16 + index
10675+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10676+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10677+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
10678+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);  // gradient clipping by global norm
10679+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10680+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10681+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10682+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);  // m = beta1*m + (1-beta1)*g
10683+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);  // v = beta2*v + (1-beta2)*g^2
10684+    avx_r0 = SIMD_SQRT_F32(v_r);
10685+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));  // m / (sqrt(v) + eps)
10686+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);  // + decay * var (weight decay)
10687+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);  // var -= lr * update
10688+    SIMD_ST_F32(m + index, m_r);
10689+    SIMD_ST_F32(v + index, v_r);
10690+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
10691+  }
10692+
10693+  return index;
10694+}
10695+
10696+static inline size_t FusedCastAdamFp16Fp32SSE(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10697+    float global_norm_reciprocal, size_t end) {  // AdamWeightDecay step, fp16 var / fp32 gradient; returns first index not processed by SIMD
10698+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10699+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10700+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10701+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10702+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10703+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10704+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10705+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10706+
10707+  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // fix: (end - BLOCK_NUM + 1) underflows size_t when end < BLOCK_NUM
10708+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));  // fix: was var16 (no + index) — reloaded lanes 0..3 every iteration; store below uses var16 + index
10709+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10710+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10711+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
10712+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);  // gradient clipping by global norm
10713+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10714+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10715+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10716+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);  // m = beta1*m + (1-beta1)*g
10717+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);  // v = beta2*v + (1-beta2)*g^2
10718+    avx_r0 = SIMD_SQRT_F32(v_r);
10719+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));  // m / (sqrt(v) + eps)
10720+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);  // + decay * var (weight decay)
10721+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);  // var -= lr * update
10722+    SIMD_ST_F32(m + index, m_r);
10723+    SIMD_ST_F32(v + index, v_r);
10724+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
10725+  }
10726+
10727+  return index;
10728+}
10729+#endif
10730+
10731+#undef MS_SIMD_INSTRUCTION
10732+#undef BLOCK_NUM
10733+#pragma GCC pop_options
10734+#undef MS_SIMD_SSE
10735+#ifdef __cplusplus
10736+}
10737+#endif
10738+#endif
10739diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
10740new file mode 100644
10741index 00000000..eb705534
10742--- /dev/null
10743+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
10744@@ -0,0 +1,124 @@
10745+/**
10746+ * Copyright 2022 Huawei Technologies Co., Ltd
10747+ *
10748+ * Licensed under the Apache License, Version 2.0 (the "License");
10749+ * you may not use this file except in compliance with the License.
10750+ * You may obtain a copy of the License at
10751+ *
10752+ * http://www.apache.org/licenses/LICENSE-2.0
10753+ *
10754+ * Unless required by applicable law or agreed to in writing, software
10755+ * distributed under the License is distributed on an "AS IS" BASIS,
10756+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10757+ * See the License for the specific language governing permissions and
10758+ * limitations under the License.
10759+ */
10760+
10761+#ifndef MINDSPORE_NNACL_FP32_ADD_SSE_H_
10762+#define MINDSPORE_NNACL_FP32_ADD_SSE_H_
10763+
10764+#include "nnacl/intrinsics/ms_simd_instructions.h"
10765+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10766+
10767+#ifdef __cplusplus
10768+extern "C" {
10769+#endif
10770+#pragma GCC push_options
10771+#pragma GCC target("sse4.1")
10772+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10773+#define BLOCK_NUM 4
10774+#define MS_SIMD_SSE
10775+
10776+static inline int ElementOptAddSSE(int index, const float *in0, const float *in1, float *out, int size) {
10777+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
10778+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10779+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10780+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
10781+    SIMD_ST_F32(out + index, vout);
10782+  }
10783+  return index;
10784+}
10785+
10786+static inline int ElementOptAddIntSSE(int index, const int *in0, const int *in1, int *out,
10787+                                                     int size) {
10788+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
10789+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10790+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
10791+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
10792+    SIMD_ST_EPI32(out + index, vout);
10793+  }
10794+  return index;
10795+}
10796+
10797+static inline int ElementOptAddReluSSE(int index, const float *in0, const float *in1, float *out,
10798+                                                      int size) {
10799+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
10800+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10801+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10802+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
10803+    SIMD_ST_F32(out + index, vout);
10804+  }
10805+  return index;
10806+}
10807+
10808+static inline int ElementOptAddRelu6SSE(int index, const float *in0, const float *in1, float *out,
10809+                                                       int size) {
10810+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
10811+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10812+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10813+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
10814+    SIMD_ST_F32(out + index, vout);
10815+  }
10816+  return index;
10817+}
10818+
10819+static inline int ElementAddSSE(int index, const float *in0, const float *in1, float *out, int size) {
10820+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10821+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10822+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10823+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
10824+    SIMD_ST_F32(out + index, vout);
10825+  }
10826+  return index;
10827+}
10828+
10829+static inline int ElementAddReluSSE(int index, const float *in0, const float *in1, float *out,
10830+                                                   int size) {
10831+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10832+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10833+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10834+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
10835+    SIMD_ST_F32(out + index, vout);
10836+  }
10837+  return index;
10838+}
10839+
10840+static inline int ElementAddRelu6SSE(int index, const float *in0, const float *in1, float *out,
10841+                                                    int size) {
10842+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10843+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10844+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10845+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
10846+    SIMD_ST_F32(out + index, vout);
10847+  }
10848+  return index;
10849+}
10850+
10851+static inline int ElementAddIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
10852+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10853+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
10854+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
10855+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
10856+    SIMD_ST_EPI32(out + index, vout);
10857+  }
10858+  return index;
10859+}
10860+
10861+#undef MS_SIMD_INSTRUCTION
10862+#undef BLOCK_NUM
10863+#pragma GCC pop_options
10864+#undef MS_SIMD_SSE
10865+#ifdef __cplusplus
10866+}
10867+#endif
10868+#endif
10869diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
10870new file mode 100644
10871index 00000000..173890b4
10872--- /dev/null
10873+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
10874@@ -0,0 +1,254 @@
10875+/**
10876+ * Copyright 2022 Huawei Technologies Co., Ltd
10877+ *
10878+ * Licensed under the Apache License, Version 2.0 (the "License");
10879+ * you may not use this file except in compliance with the License.
10880+ * You may obtain a copy of the License at
10881+ *
10882+ * http://www.apache.org/licenses/LICENSE-2.0
10883+ *
10884+ * Unless required by applicable law or agreed to in writing, software
10885+ * distributed under the License is distributed on an "AS IS" BASIS,
10886+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10887+ * See the License for the specific language governing permissions and
10888+ * limitations under the License.
10889+ */
10890+
10891+#ifndef MINDSPORE_NNACL_ARITHMETIC_SSE_H_
10892+#define MINDSPORE_NNACL_ARITHMETIC_SSE_H_
10893+
10894+#include "nnacl/intrinsics/ms_simd_instructions.h"
10895+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10896+
10897+#ifdef __cplusplus
10898+extern "C" {
10899+#endif
10900+#pragma GCC push_options
10901+#pragma GCC target("sse4.1")
10902+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10903+#define BLOCK_NUM 4
10904+#define MS_SIMD_SSE
10905+
10906+#ifndef MS_SIMD_NEON
10907+static inline int ElementFloorModSSE(int index, const float *in0, const float *in1, float *out, int size) {
10908+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10909+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10910+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10911+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10912+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
10913+    SIMD_ST_F32(out + index, out_tmp);
10914+  }
10915+  return index;
10916+}
10917+
10918+static inline int ElementOptFloorModNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
10919+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
10920+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10921+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10922+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10923+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
10924+    SIMD_ST_F32(out + index, out_tmp);
10925+  }
10926+  return index;
10927+}
10928+
10929+static inline int ElementOptFloorModNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
10930+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
10931+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10932+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10933+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10934+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
10935+    SIMD_ST_F32(out + index, out_tmp);
10936+  }
10937+  return index;
10938+}
10939+
10940+static inline int ElementFloorDivSSE(int index, const float *in0, const float *in1, float *out, int size) {
10941+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10942+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10943+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10944+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10945+    SIMD_ST_F32(out + index, floor_tmp);
10946+  }
10947+  return index;
10948+}
10949+
10950+static inline int ElementOptFloorDivNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
10951+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
10952+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10953+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10954+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10955+    SIMD_ST_F32(out + index, out_tmp);
10956+  }
10957+  return index;
10958+}
10959+
10960+static inline int ElementOptFloorDivNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
10961+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
10962+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10963+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10964+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10965+    SIMD_ST_F32(out + index, out_tmp);
10966+  }
10967+  return index;
10968+}
10969+#endif
10970+
10971+static inline int ElementFloorDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
10972+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10973+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
10974+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
10975+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
10976+    SIMD_ST_EPI32(out + index, out_tmp);
10977+  }
10978+  return index;
10979+}
10980+
10981+static inline int ElementOptFloorDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
10982+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
10983+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10984+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
10985+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
10986+    SIMD_ST_EPI32(out + index, out_tmp);
10987+  }
10988+  return index;
10989+}
10990+
10991+static inline int ElementOptFloorDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
10992+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
10993+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10994+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
10995+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
10996+    SIMD_ST_EPI32(out + index, out_tmp);
10997+  }
10998+  return index;
10999+}
11000+
11001+static inline int ElementMaximumSSE(int index, const float *in0, const float *in1, float *out, int size) {
11002+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11003+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11004+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11005+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
11006+    SIMD_ST_F32(out + index, out_tmp);
11007+  }
11008+  return index;
11009+}
11010+
11011+static inline int ElementOptMaximumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
11012+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
11013+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11014+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11015+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
11016+    SIMD_ST_F32(out + index, out_tmp);
11017+  }
11018+  return index;
11019+}
11020+
11021+static inline int ElementOptMaximumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
11022+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
11023+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11024+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11025+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
11026+    SIMD_ST_F32(out + index, out_tmp);
11027+  }
11028+  return index;
11029+}
11030+
11031+static inline int ElementMaximumIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
11032+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11033+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11034+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11035+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
11036+    SIMD_ST_EPI32(out + index, out_tmp);
11037+  }
11038+  return index;
11039+}
11040+
11041+static inline int ElementOptMaximumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
11042+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
11043+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11044+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11045+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
11046+    SIMD_ST_EPI32(out + index, out_tmp);
11047+  }
11048+  return index;
11049+}
11050+
11051+static inline int ElementOptMaximumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
11052+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
11053+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11054+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11055+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
11056+    SIMD_ST_EPI32(out + index, out_tmp);
11057+  }
11058+  return index;
11059+}
11060+
11061+static inline int ElementMinimumIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
11062+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11063+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11064+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11065+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
11066+    SIMD_ST_EPI32(out + index, out_tmp);
11067+  }
11068+  return index;
11069+}
11070+
11071+static inline int ElementOptMinimumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
11072+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
11073+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11074+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11075+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
11076+    SIMD_ST_EPI32(out + index, out_tmp);
11077+  }
11078+  return index;
11079+}
11080+
11081+static inline int ElementOptMinimumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
11082+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
11083+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11084+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11085+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
11086+    SIMD_ST_EPI32(out + index, out_tmp);
11087+  }
11088+  return index;
11089+}
11090+
11091+static inline int ElementMinimumSSE(int index, const float *in0, const float *in1, float *out, int size) {
11092+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11093+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11094+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11095+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
11096+    SIMD_ST_F32(out + index, out_tmp);
11097+  }
11098+  return index;
11099+}
11100+
11101+static inline int ElementOptMinimumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
11102+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
11103+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11104+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11105+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
11106+    SIMD_ST_F32(out + index, out_tmp);
11107+  }
11108+  return index;
11109+}
11110+
11111+static inline int ElementOptMinimumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
11112+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
11113+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11114+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11115+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
11116+    SIMD_ST_F32(out + index, out_tmp);
11117+  }
11118+  return index;
11119+}
11120+
11121+#undef MS_SIMD_INSTRUCTION
11122+#undef BLOCK_NUM
11123+#pragma GCC pop_options
11124+#undef MS_SIMD_SSE
11125+#ifdef __cplusplus
11126+}
11127+#endif
11128+#endif
11129diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
11130new file mode 100644
11131index 00000000..0a1d21c2
11132--- /dev/null
11133+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
11134@@ -0,0 +1,129 @@
11135+/**
11136+ * Copyright 2022 Huawei Technologies Co., Ltd
11137+ *
11138+ * Licensed under the Apache License, Version 2.0 (the "License");
11139+ * you may not use this file except in compliance with the License.
11140+ * You may obtain a copy of the License at
11141+ *
11142+ * http://www.apache.org/licenses/LICENSE-2.0
11143+ *
11144+ * Unless required by applicable law or agreed to in writing, software
11145+ * distributed under the License is distributed on an "AS IS" BASIS,
11146+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11147+ * See the License for the specific language governing permissions and
11148+ * limitations under the License.
11149+ */
11150+
11151+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_
11152+#define MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_
11153+
11154+#include "nnacl/intrinsics/ms_simd_instructions.h"
11155+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11156+
11157+#ifdef __cplusplus
11158+extern "C" {
11159+#endif
11160+#pragma GCC push_options
11161+#pragma GCC target("sse4.1")
11162+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11163+#define BLOCK_NUM 4
11164+#define MS_SIMD_SSE
11165+
11166+#if defined(MS_SIMD_AVX512)
11167+// only avx512 support abs fp32 instruction
11168+static inline int ElementAbsSSE(int index, const float *input, float *output, const int element_size) {
11169+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11170+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
11171+  }
11172+  return index;
11173+}
11174+
11175+static inline int ElementAbsIntSSE(int index, const int *input, int *output, const int element_size) {
11176+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11177+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
11178+  }
11179+  return index;
11180+}
11181+#endif
11182+
11183+static inline int ElementSquareSSE(int index, const float *input, float *output, const int element_size) {
11184+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11185+    SIMD_F32 vin = SIMD_LD_F32(input + index);
11186+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
11187+  }
11188+  return index;
11189+}
11190+
11191+static inline int ElementSqrtSSE(int index, const float *input, float *output, const int element_size) {
11192+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11193+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
11194+  }
11195+  return index;
11196+}
11197+
11198+static inline int ElementRsqrtSSE(int index, const float *input, float *output, const int element_size) {
11199+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11200+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
11201+  }
11202+  return index;
11203+}
11204+
11205+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
11206+// avx512 dont support round fp32 instruction
11207+static inline int ElementRoundSSE(int index, const float *input, float *output, const int element_size) {
11208+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11209+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
11210+  }
11211+  return index;
11212+}
11213+#endif
11214+
11215+#ifndef MS_SIMD_NEON
11216+// neon dont support floor fp32 instruction
11217+static inline int ElementFloorSSE(int index, const float *input, float *output, const int element_size) {
11218+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11219+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
11220+  }
11221+  return index;
11222+}
11223+#endif
11224+
11225+#ifndef MS_SIMD_NEON
11226+static inline int ElementCeilSSE(int index, const float *input, float *output, const int element_size) {
11227+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11228+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
11229+  }
11230+  return index;
11231+}
11232+#endif
11233+
11234+static inline int ElementNegativeSSE(int index, const float *input, float *output, const int element_size) {
11235+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11236+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
11237+  }
11238+  return index;
11239+}
11240+
11241+static inline int ElementNegativeIntSSE(int index, const int *input, int *output, const int element_size) {
11242+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11243+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
11244+  }
11245+  return index;
11246+}
11247+
11248+static inline int ElementReciprocalSSE(int index, const float *input, float *output, const int element_size) {
11249+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
11250+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11251+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
11252+  }
11253+  return index;
11254+}
11255+
11256+#undef MS_SIMD_INSTRUCTION
11257+#undef BLOCK_NUM
11258+#pragma GCC pop_options
11259+#undef MS_SIMD_SSE
11260+#ifdef __cplusplus
11261+}
11262+#endif
11263+#endif
11264diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
11265new file mode 100644
11266index 00000000..f04b4e1f
11267--- /dev/null
11268+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
11269@@ -0,0 +1,67 @@
11270+/**
11271+ * Copyright 2022 Huawei Technologies Co., Ltd
11272+ *
11273+ * Licensed under the Apache License, Version 2.0 (the "License");
11274+ * you may not use this file except in compliance with the License.
11275+ * You may obtain a copy of the License at
11276+ *
11277+ * http://www.apache.org/licenses/LICENSE-2.0
11278+ *
11279+ * Unless required by applicable law or agreed to in writing, software
11280+ * distributed under the License is distributed on an "AS IS" BASIS,
11281+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11282+ * See the License for the specific language governing permissions and
11283+ * limitations under the License.
11284+ */
11285+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_  // fix: guard was copy-pasted from the activation header and collided with it
11286+#define MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_
11287+
11288+#include "nnacl/intrinsics/ms_simd_instructions.h"
11289+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11290+
11291+#ifdef __cplusplus
11292+extern "C" {
11293+#endif
11294+#pragma GCC push_options
11295+#pragma GCC target("sse4.1")
11296+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11297+#define BLOCK_NUM 4
11298+#define MS_SIMD_SSE
11299+
11300+static inline int BatchNormFp32SSE(int index, const float *input, const float *mean,
11301+  const float *variance, int channel, float epsilon, float *output) {
11302+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11303+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
11304+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
11305+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
11306+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
11307+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
11308+    SIMD_ST_F32(output + index, output_data);
11309+  }
11310+  return index;
11311+}
11312+
11313+static inline int FusedBatchNormFp32SSE(int index, const float *input, const float *scale,
11314+  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
11315+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11316+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
11317+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
11318+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
11319+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
11320+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
11321+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
11322+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
11323+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
11324+    SIMD_ST_F32(output + index, output_data);
11325+  }
11326+  return index;
11327+}
11328+
11329+#undef MS_SIMD_INSTRUCTION
11330+#undef BLOCK_NUM
11331+#pragma GCC pop_options
11332+#undef MS_SIMD_SSE
11333+#ifdef __cplusplus
11334+}
11335+#endif
11336+#endif
11337diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
11338new file mode 100644
11339index 00000000..c929ccaf
11340--- /dev/null
11341+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
11342@@ -0,0 +1,69 @@
11343+/**
11344+ * Copyright 2022 Huawei Technologies Co., Ltd
11345+ *
11346+ * Licensed under the Apache License, Version 2.0 (the "License");
11347+ * you may not use this file except in compliance with the License.
11348+ * You may obtain a copy of the License at
11349+ *
11350+ * http://www.apache.org/licenses/LICENSE-2.0
11351+ *
11352+ * Unless required by applicable law or agreed to in writing, software
11353+ * distributed under the License is distributed on an "AS IS" BASIS,
11354+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11355+ * See the License for the specific language governing permissions and
11356+ * limitations under the License.
11357+ */
11358+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_
11359+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_
11360+
11361+#include "nnacl/intrinsics/ms_simd_instructions.h"
11362+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11363+
11364+#ifdef __cplusplus
11365+extern "C" {
11366+#endif
11367+#pragma GCC push_options
11368+#pragma GCC target("sse4.1")
11369+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11370+#define BLOCK_NUM 4
11371+#define MS_SIMD_SSE
11372+
11373+static inline int BCEWithLogitLossSSE(int index, const float *logits, const float *label,
11374+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
11375+    float *reduction_sum) {
11376+    SIMD_F32 zero = SIMD_SET0_F32;
11377+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
11378+    SIMD_F32 middle_output = SIMD_SET0_F32;
11379+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11380+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
11381+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
11382+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
11383+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
11384+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
11385+      SIMD_F32 max_value = neg_logits_tmp;
11386+      max_value = SIMD_MIN_F32(max_value, zero);
11387+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
11388+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
11389+      SIMD_F32 log_exp_value =
11390+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
11391+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
11392+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
11393+      if (reduction) {
11394+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
11395+      } else {
11396+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
11397+      }
11398+    }
11399+    if (reduction) {
11400+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
11401+    }
11402+    return index;
11403+}
11404+#undef MS_SIMD_INSTRUCTION
11405+#undef BLOCK_NUM
11406+#pragma GCC pop_options
11407+#undef MS_SIMD_SSE
11408+#ifdef __cplusplus
11409+}
11410+#endif
11411+#endif
11412diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
11413new file mode 100644
11414index 00000000..0544d239
11415--- /dev/null
11416+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
11417@@ -0,0 +1,64 @@
11418+/**
11419+ * Copyright 2022 Huawei Technologies Co., Ltd
11420+ *
11421+ * Licensed under the Apache License, Version 2.0 (the "License");
11422+ * you may not use this file except in compliance with the License.
11423+ * You may obtain a copy of the License at
11424+ *
11425+ * http://www.apache.org/licenses/LICENSE-2.0
11426+ *
11427+ * Unless required by applicable law or agreed to in writing, software
11428+ * distributed under the License is distributed on an "AS IS" BASIS,
11429+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11430+ * See the License for the specific language governing permissions and
11431+ * limitations under the License.
11432+ */
11433+
11434+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
11435+#define MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
11436+
11437+#include "nnacl/intrinsics/ms_simd_instructions.h"
11438+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11439+
11440+#ifdef __cplusplus
11441+extern "C" {
11442+#endif
11443+#pragma GCC push_options
11444+#pragma GCC target("sse4.1")
11445+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11446+#define BLOCK_NUM 4
11447+#define MS_SIMD_SSE
11448+
11449+static inline int BiasAddByInnerCoreSSE(int index, const float *input, const float *bias, float *output,
11450+                                                       int64_t num) {
11451+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11452+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
11453+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
11454+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
11455+    SIMD_ST_F32(output + index, vout);
11456+  }
11457+  return index;
11458+}
11459+
11460+static inline int BiasAddByBatchCoreSSE(int index, const float *input, const float *bias, float *output1,
11461+                                                       float *output2, float *output3, float *output4, int64_t num) {
11462+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11463+    SIMD_LDX4_F32(input_data, input + index, num);
11464+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
11465+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
11466+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
11467+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
11468+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
11469+  }
11470+  return index;
11471+}
11472+
11473+#undef MS_SIMD_INSTRUCTION
11474+#undef BLOCK_NUM
11475+#pragma GCC pop_options
11476+#undef MS_SIMD_SSE
11477+#ifdef __cplusplus
11478+}
11479+#endif
11480+
11481+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
11482diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
11483new file mode 100644
11484index 00000000..4eca209f
11485--- /dev/null
11486+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
11487@@ -0,0 +1,56 @@
11488+/**
11489+ * Copyright 2022 Huawei Technologies Co., Ltd
11490+ *
11491+ * Licensed under the Apache License, Version 2.0 (the "License");
11492+ * you may not use this file except in compliance with the License.
11493+ * You may obtain a copy of the License at
11494+ *
11495+ * http://www.apache.org/licenses/LICENSE-2.0
11496+ *
11497+ * Unless required by applicable law or agreed to in writing, software
11498+ * distributed under the License is distributed on an "AS IS" BASIS,
11499+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11500+ * See the License for the specific language governing permissions and
11501+ * limitations under the License.
11502+ */
11503+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_
11504+#define MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_
11505+
11506+#include "nnacl/intrinsics/ms_simd_instructions.h"
11507+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11508+
11509+#ifdef __cplusplus
11510+extern "C" {
11511+#endif
11512+#pragma GCC push_options
11513+#pragma GCC target("sse4.1")
11514+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11515+#define BLOCK_NUM 4
11516+#define MS_SIMD_SSE
11517+
11518+static inline int Int32ToFloat32SSE(int index, const int32_t *input, float *output, int number) {
11519+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11520+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
11521+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
11522+  }
11523+  return index;
11524+}
11525+
11526+#ifndef MS_SIMD_NEON
11527+static inline int Float32ToInt32SSE(int index, const float *input, int32_t *output, int number) {
11528+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11529+    SIMD_F32 value = SIMD_LD_F32(input + index);
11530+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
11531+  }
11532+  return index;
11533+}
11534+#endif
11535+
11536+#undef MS_SIMD_INSTRUCTION
11537+#undef BLOCK_NUM
11538+#pragma GCC pop_options
11539+#undef MS_SIMD_SSE
11540+#ifdef __cplusplus
11541+}
11542+#endif
11543+#endif
11544diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
11545new file mode 100644
11546index 00000000..3d116113
11547--- /dev/null
11548+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
11549@@ -0,0 +1,70 @@
11550+/**
11551+ * Copyright 2022 Huawei Technologies Co., Ltd
11552+ *
11553+ * Licensed under the Apache License, Version 2.0 (the "License");
11554+ * you may not use this file except in compliance with the License.
11555+ * You may obtain a copy of the License at
11556+ *
11557+ * http://www.apache.org/licenses/LICENSE-2.0
11558+ *
11559+ * Unless required by applicable law or agreed to in writing, software
11560+ * distributed under the License is distributed on an "AS IS" BASIS,
11561+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11562+ * See the License for the specific language governing permissions and
11563+ * limitations under the License.
11564+ */
11565+#ifndef MINDSPORE_NNACL_FP32_CDIST_SSE_H_
11566+#define MINDSPORE_NNACL_FP32_CDIST_SSE_H_
11567+
11568+#include "nnacl/intrinsics/ms_simd_instructions.h"
11569+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11570+
11571+#ifdef __cplusplus
11572+extern "C" {
11573+#endif
11574+#pragma GCC push_options
11575+#pragma GCC target("sse4.1")
11576+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11577+#define BLOCK_NUM 4
11578+#define MS_SIMD_SSE
11579+
11580+static inline int64_t CdistTwoNormalOptSSE(int64_t index, const float *a, const float *b,
11581+                                                          float *out, int64_t size) {
11582+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
11583+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11584+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
11585+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
11586+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
11587+    tmp_vec = SIMD_ABS_F32(tmp_vec);
11588+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
11589+  }
11590+  *out += SIMD_GET_SUM_F32(result_vec);
11591+
11592+  return index;
11593+}
11594+
11595+static inline int64_t CdistPNormalOptSSE(int64_t index, const float *a, const float *b,
11596+                                                        float *out, int64_t size, float p) {
11597+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
11598+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
11599+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11600+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
11601+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
11602+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
11603+    tmp_vec = SIMD_ABS_F32(tmp_vec);
11604+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
11605+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
11606+  }
11607+  *out += SIMD_GET_SUM_F32(result_vec);
11608+
11609+  return index;
11610+}
11611+
11612+#undef MS_SIMD_INSTRUCTION
11613+#undef BLOCK_NUM
11614+#pragma GCC pop_options
11615+#undef MS_SIMD_SSE
11616+#ifdef __cplusplus
11617+}
11618+#endif
11619+#endif
11620diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
11621new file mode 100644
11622index 00000000..1b67143f
11623--- /dev/null
11624+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
11625@@ -0,0 +1,121 @@
11626+/**
11627+ * Copyright 2022 Huawei Technologies Co., Ltd
11628+ *
11629+ * Licensed under the Apache License, Version 2.0 (the "License");
11630+ * you may not use this file except in compliance with the License.
11631+ * You may obtain a copy of the License at
11632+ *
11633+ * http://www.apache.org/licenses/LICENSE-2.0
11634+ *
11635+ * Unless required by applicable law or agreed to in writing, software
11636+ * distributed under the License is distributed on an "AS IS" BASIS,
11637+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11638+ * See the License for the specific language governing permissions and
11639+ * limitations under the License.
11640+ */
11641+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_
11642+#define MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_
11643+
11644+#include "nnacl/intrinsics/ms_simd_instructions.h"
11645+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11646+
11647+#ifdef __cplusplus
11648+extern "C" {
11649+#endif
11650+#pragma GCC push_options
11651+#pragma GCC target("sse4.1")
11652+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11653+#define BLOCK_NUM 4
11654+#define MS_SIMD_SSE
11655+
11656+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
11657+// (a, b, c) -> (0, a,   a+b)    exclusive == true
11658+static inline int64_t CumsumOutputInitWithInputSSE(int64_t index, const float *layer_input,
11659+  float *layer_output, int inner_dim) {
11660+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11661+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
11662+  }
11663+  return index;
11664+}
11665+
11666+static inline int64_t CumsumOutputInitWithZeroSSE(int64_t index, float *layer_output, int inner_dim) {
11667+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11668+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
11669+  }
11670+  return index;
11671+}
11672+
11673+static inline int64_t CumsumSSE(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
11674+  int inner_dim) {
11675+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11676+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
11677+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
11678+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
11679+    SIMD_ST_F32(layer_output + index, out_val);
11680+  }
11681+  return index;
11682+}
11683+
11684+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
11685+// (a, b, c) -> (c+b, c, 0) exclusive==true
11686+static inline int64_t CumsumReverseSSE(int64_t index, const float *layer_input, float *layer_output,
11687+  float *layer_last_output, int inner_dim) {
11688+
11689+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11690+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
11691+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
11692+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
11693+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
11694+  }
11695+  return index;
11696+}
11697+
11698+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
11699+// (a, b, c) -> (0, a,   a+b)    exclusive == true
11700+static inline int64_t CumsumIntOutputInitWithInputSSE(int64_t index, const int *layer_input,
11701+  int *layer_output, int inner_dim) {
11702+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11703+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
11704+  }
11705+  return index;
11706+}
11707+
11708+static inline int64_t CumsumIntOutputInitWithZeroSSE(int64_t index, int *layer_output, int inner_dim) {
11709+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11710+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
11711+  }
11712+  return index;
11713+}
11714+
11715+static inline int64_t CumsumIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
11716+  int inner_dim) {
11717+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11718+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
11719+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
11720+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
11721+    SIMD_ST_EPI32(layer_output + index, out_val);
11722+  }
11723+  return index;
11724+}
11725+
11726+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
11727+// (a, b, c) -> (c+b, c, 0) exclusive==true
11728+static inline int64_t CumsumReverseIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
11729+  int inner_dim) {
11730+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11731+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
11732+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
11733+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
11734+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
11735+  }
11736+  return index;
11737+}
11738+
11739+#undef MS_SIMD_INSTRUCTION
11740+#undef BLOCK_NUM
11741+#pragma GCC pop_options
11742+#undef MS_SIMD_SSE
11743+#ifdef __cplusplus
11744+}
11745+#endif
11746+#endif
11747diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
11748new file mode 100644
11749index 00000000..5f0c6009
11750--- /dev/null
11751+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
11752@@ -0,0 +1,167 @@
11753+/**
11754+ * Copyright 2022 Huawei Technologies Co., Ltd
11755+ *
11756+ * Licensed under the Apache License, Version 2.0 (the "License");
11757+ * you may not use this file except in compliance with the License.
11758+ * You may obtain a copy of the License at
11759+ *
11760+ * http://www.apache.org/licenses/LICENSE-2.0
11761+ *
11762+ * Unless required by applicable law or agreed to in writing, software
11763+ * distributed under the License is distributed on an "AS IS" BASIS,
11764+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11765+ * See the License for the specific language governing permissions and
11766+ * limitations under the License.
11767+ */
11768+
11769+#ifndef MINDSPORE_NNACL_FP32_DIV_SSE_H_
11770+#define MINDSPORE_NNACL_FP32_DIV_SSE_H_
11771+
11772+#include "nnacl/intrinsics/ms_simd_instructions.h"
11773+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11774+
11775+#ifdef __cplusplus
11776+extern "C" {
11777+#endif
11778+#pragma GCC push_options
11779+#pragma GCC target("sse4.1")
11780+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11781+#define BLOCK_NUM 4
11782+#define MS_SIMD_SSE
11783+
11784+static inline int ElementOptDivNum0SSE(int index, const float *in0, const float *in1, float *out,
11785+                                                      int size) {
11786+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
11787+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11788+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11789+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
11790+    SIMD_ST_F32(out + index, vout);
11791+  }
11792+  return index;
11793+}
11794+
11795+static inline int ElementOptDivNum1SSE(int index, const float *in0, const float *in1, float *out,
11796+                                                      int size) {
11797+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
11798+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11799+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11800+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
11801+    SIMD_ST_F32(out + index, vout);
11802+  }
11803+  return index;
11804+}
11805+
11806+static inline int ElementOptDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
11807+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
11808+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11809+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
11810+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
11811+    SIMD_ST_EPI32(out + index, vout);
11812+  }
11813+  return index;
11814+}
11815+
11816+static inline int ElementOptDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
11817+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
11818+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11819+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
11820+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
11821+    SIMD_ST_EPI32(out + index, vout);
11822+  }
11823+  return index;
11824+}
11825+
11826+static inline int ElementOptDivReluNum0SSE(int index, const float *in0, const float *in1, float *out,
11827+                                                          int size) {
11828+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
11829+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11830+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11831+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
11832+    SIMD_ST_F32(out + index, vout);
11833+  }
11834+  return index;
11835+}
11836+
11837+static inline int ElementOptDivReluNum1SSE(int index, const float *in0, const float *in1, float *out,
11838+                                                          int size) {
11839+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
11840+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11841+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11842+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
11843+    SIMD_ST_F32(out + index, vout);
11844+  }
11845+  return index;
11846+}
11847+
11848+static inline int ElementOptDivRelu6Num0SSE(int index, const float *in0, const float *in1, float *out,
11849+                                                           int size) {
11850+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
11851+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11852+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11853+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
11854+    SIMD_ST_F32(out + index, vout);
11855+  }
11856+  return index;
11857+}
11858+
11859+static inline int ElementOptDivRelu6Num1SSE(int index, const float *in0, const float *in1, float *out,
11860+                                                           int size) {
11861+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
11862+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11863+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11864+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
11865+    SIMD_ST_F32(out + index, vout);
11866+  }
11867+  return index;
11868+}
11869+
11870+static inline int ElementDivSSE(int index, const float *in0, const float *in1, float *out, int size) {
11871+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11872+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11873+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11874+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
11875+    SIMD_ST_F32(out + index, vout);
11876+  }
11877+  return index;
11878+}
11879+
11880+static inline int ElementDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
11881+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11882+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
11883+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
11884+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
11885+    SIMD_ST_EPI32(out + index, vout);
11886+  }
11887+  return index;
11888+}
11889+
11890+static inline int ElementDivReluSSE(int index, const float *in0, const float *in1, float *out,
11891+                                                   int size) {
11892+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11893+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11894+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11895+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
11896+    SIMD_ST_F32(out + index, vout);
11897+  }
11898+  return index;
11899+}
11900+
11901+static inline int ElementDivRelu6SSE(int index, const float *in0, const float *in1, float *out,
11902+                                                    int size) {
11903+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11904+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11905+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11906+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
11907+    SIMD_ST_F32(out + index, vout);
11908+  }
11909+  return index;
11910+}
11911+
11912+#undef MS_SIMD_INSTRUCTION
11913+#undef BLOCK_NUM
11914+#pragma GCC pop_options
11915+#undef MS_SIMD_SSE
11916+#ifdef __cplusplus
11917+}
11918+#endif
11919+#endif
11920diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
11921new file mode 100644
11922index 00000000..2429ed38
11923--- /dev/null
11924+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
11925@@ -0,0 +1,46 @@
11926+/**
11927+ * Copyright 2022 Huawei Technologies Co., Ltd
11928+ *
11929+ * Licensed under the Apache License, Version 2.0 (the "License");
11930+ * you may not use this file except in compliance with the License.
11931+ * You may obtain a copy of the License at
11932+ *
11933+ * http://www.apache.org/licenses/LICENSE-2.0
11934+ *
11935+ * Unless required by applicable law or agreed to in writing, software
11936+ * distributed under the License is distributed on an "AS IS" BASIS,
11937+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11938+ * See the License for the specific language governing permissions and
11939+ * limitations under the License.
11940+ */
11941+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_
11942+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_
11943+
11944+#include "nnacl/intrinsics/ms_simd_instructions.h"
11945+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11946+
11947+#ifdef __cplusplus
11948+extern "C" {
11949+#endif
11950+#pragma GCC push_options
11951+#pragma GCC target("sse4.1")
11952+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11953+#define BLOCK_NUM 4
11954+#define MS_SIMD_SSE
11955+
11956+static inline int DropoutFp32SSE(int index, const float *input, float scale,
11957+    int length, float *output) {
11958+    SIMD_F32 scale_value = SIMD_MOV_F32(scale);
11959+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11960+        SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
11961+    }
11962+    return index;
11963+}
11964+#undef MS_SIMD_INSTRUCTION
11965+#undef BLOCK_NUM
11966+#pragma GCC pop_options
11967+#undef MS_SIMD_SSE
11968+#ifdef __cplusplus
11969+}
11970+#endif
11971+#endif
11972diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
11973new file mode 100644
11974index 00000000..3d802fb3
11975--- /dev/null
11976+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
11977@@ -0,0 +1,63 @@
11978+/**
11979+ * Copyright 2022 Huawei Technologies Co., Ltd
11980+ *
11981+ * Licensed under the Apache License, Version 2.0 (the "License");
11982+ * you may not use this file except in compliance with the License.
11983+ * You may obtain a copy of the License at
11984+ *
11985+ * http://www.apache.org/licenses/LICENSE-2.0
11986+ *
11987+ * Unless required by applicable law or agreed to in writing, software
11988+ * distributed under the License is distributed on an "AS IS" BASIS,
11989+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11990+ * See the License for the specific language governing permissions and
11991+ * limitations under the License.
11992+ */
11993+
11994+#ifndef MINDSPORE_NNACL_FP32_EXP_SSE_H_
11995+#define MINDSPORE_NNACL_FP32_EXP_SSE_H_
11996+
11997+#include "nnacl/intrinsics/ms_simd_instructions.h"
11998+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11999+
12000+#ifdef __cplusplus
12001+extern "C" {
12002+#endif
12003+#pragma GCC push_options
12004+#pragma GCC target("sse4.1")
12005+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12006+#define BLOCK_NUM 4
12007+#define MS_SIMD_SSE
12008+
12009+static inline int64_t ExpFp32SSE(int64_t index, const float *src, float *dst, int num) {
12010+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12011+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
12012+  }
12013+  return index;
12014+}
12015+
12016+static inline int64_t ExpFp32WithInScaleSSE(int64_t index, const float *src, float *dst, int num, float in_scale) {
12017+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
12018+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12019+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
12020+  }
12021+  return index;
12022+}
12023+
12024+static inline int64_t ExpFp32WithOutScaleSSE(int64_t index, const float *src, float *dst, int num, float out_scale) {
12025+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
12026+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12027+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
12028+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
12029+  }
12030+  return index;
12031+}
12032+
12033+#undef MS_SIMD_INSTRUCTION
12034+#undef BLOCK_NUM
12035+#pragma GCC pop_options
12036+#undef MS_SIMD_SSE
12037+#ifdef __cplusplus
12038+}
12039+#endif
12040+#endif
12041diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
12042new file mode 100644
12043index 00000000..9c71eefb
12044--- /dev/null
12045+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
12046@@ -0,0 +1,53 @@
12047+/**
12048+ * Copyright 2022 Huawei Technologies Co., Ltd
12049+ *
12050+ * Licensed under the Apache License, Version 2.0 (the "License");
12051+ * you may not use this file except in compliance with the License.
12052+ * You may obtain a copy of the License at
12053+ *
12054+ * http://www.apache.org/licenses/LICENSE-2.0
12055+ *
12056+ * Unless required by applicable law or agreed to in writing, software
12057+ * distributed under the License is distributed on an "AS IS" BASIS,
12058+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12059+ * See the License for the specific language governing permissions and
12060+ * limitations under the License.
12061+ */
12062+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_
12063+#define MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_
12064+
12065+#include "nnacl/intrinsics/ms_simd_instructions.h"
12066+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12067+
12068+#ifdef __cplusplus
12069+extern "C" {
12070+#endif
12071+#pragma GCC push_options
12072+#pragma GCC target("sse4.1")
12073+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12074+#define BLOCK_NUM 4
12075+#define MS_SIMD_SSE
12076+
12077+static inline int FillFp32SSE(int index, float *output, int size, float data) {
12078+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12079+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
12080+  }
12081+  return index;
12082+}
12083+
12084+static inline int FillInt32SSE(int index, int *output, int size, int data) {
12085+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12086+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
12087+  }
12088+  return index;
12089+}
12090+
12091+#undef MS_SIMD_INSTRUCTION
12092+#undef BLOCK_NUM
12093+#pragma GCC pop_options
12094+#undef MS_SIMD_SSE
12095+#ifdef __cplusplus
12096+}
12097+#endif
12098+#endif
12099+
12100diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
12101new file mode 100644
12102index 00000000..1c1f57da
12103--- /dev/null
12104+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
12105@@ -0,0 +1,77 @@
12106+/**
12107+ * Copyright 2022 Huawei Technologies Co., Ltd
12108+ *
12109+ * Licensed under the Apache License, Version 2.0 (the "License");
12110+ * you may not use this file except in compliance with the License.
12111+ * You may obtain a copy of the License at
12112+ *
12113+ * http://www.apache.org/licenses/LICENSE-2.0
12114+ *
12115+ * Unless required by applicable law or agreed to in writing, software
12116+ * distributed under the License is distributed on an "AS IS" BASIS,
12117+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12118+ * See the License for the specific language governing permissions and
12119+ * limitations under the License.
12120+ */
12121+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_
12122+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_
12123+
12124+#include "nnacl/intrinsics/ms_simd_instructions.h"
12125+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12126+
12127+#ifdef __cplusplus
12128+extern "C" {
12129+#endif
12130+#pragma GCC push_options
12131+#pragma GCC target("sse4.1")
12132+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12133+#define BLOCK_NUM 4
12134+#define MS_SIMD_SSE
12135+
12136+static inline int64_t GroupNormFp32SSE(int64_t index, const float *unit_input, float scale, float offset, float mean,
12137+  float var_sqrt, int unit, float *unit_output) {
12138+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
12139+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
12140+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
12141+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
12142+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12143+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
12144+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
12145+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
12146+    SIMD_ST_F32(unit_output + index, output);
12147+  }
12148+  return index;
12149+}
12150+
12151+static inline int64_t GroupNormReduceSumSSE(int64_t index, const float *in, float *sum, int unit) {
12152+  if (unit - index >= 4 * BLOCK_NUM) {
12153+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12154+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12155+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
12156+    }
12157+    *sum += SIMD_GET_SUM_F32(tmp);
12158+  }
12159+  return index;
12160+}
12161+
12162+static inline int64_t GroupNormReduceVarSSE(int64_t index, const float *in, float mean, float *sum, int unit) {
12163+  if (unit - index >= 4 * BLOCK_NUM) {
12164+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
12165+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12166+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12167+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
12168+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
12169+    }
12170+    *sum += SIMD_GET_SUM_F32(tmp);
12171+  }
12172+  return index;
12173+}
12174+
12175+#undef MS_SIMD_INSTRUCTION
12176+#undef BLOCK_NUM
12177+#pragma GCC pop_options
12178+#undef MS_SIMD_SSE
12179+#ifdef __cplusplus
12180+}
12181+#endif
12182+#endif
12183diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
12184new file mode 100644
12185index 00000000..30af87c3
12186--- /dev/null
12187+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
12188@@ -0,0 +1,68 @@
12189+/**
12190+ * Copyright 2022 Huawei Technologies Co., Ltd
12191+ *
12192+ * Licensed under the Apache License, Version 2.0 (the "License");
12193+ * you may not use this file except in compliance with the License.
12194+ * You may obtain a copy of the License at
12195+ *
12196+ * http://www.apache.org/licenses/LICENSE-2.0
12197+ *
12198+ * Unless required by applicable law or agreed to in writing, software
12199+ * distributed under the License is distributed on an "AS IS" BASIS,
12200+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12201+ * See the License for the specific language governing permissions and
12202+ * limitations under the License.
12203+ */
12204+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_
12205+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_
12206+
12207+#include "nnacl/intrinsics/ms_simd_instructions.h"
12208+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12209+
12210+#ifdef __cplusplus
12211+extern "C" {
12212+#endif
12213+#pragma GCC push_options
12214+#pragma GCC target("sse4.1")
12215+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12216+#define BLOCK_NUM 4
12217+#define MS_SIMD_SSE
12218+
12219+static inline int LayerNormMeanAndSquareSSE(int index, const float *src, int num, float *mean, float *square_mean) {
12220+  if (num >= 4 * BLOCK_NUM) {
12221+    SIMD_F32 sum_val = SIMD_SET0_F32;
12222+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
12223+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12224+      SIMD_F32 value = SIMD_LD_F32(src + index);
12225+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
12226+      sum_val = SIMD_ADD_F32(sum_val, value);
12227+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
12228+    }
12229+    *mean += SIMD_GET_SUM_F32(sum_val);
12230+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
12231+  }
12232+  return index;
12233+}
12234+
12235+static inline int LayerNormGammaAndBetaSSE(int index, float *dst, const float *src, const float *gamma_data,
12236+  const float *beta_data, int num, const float mean, const float deno) {
12237+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
12238+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
12239+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12240+    SIMD_F32 value = SIMD_LD_F32(src + index);
12241+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
12242+    out_value = SIMD_MUL_F32(out_value, deno_val);
12243+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
12244+    SIMD_ST_F32(dst + index, out_value);
12245+  }
12246+  return index;
12247+}
12248+
12249+#undef MS_SIMD_INSTRUCTION
12250+#undef BLOCK_NUM
12251+#pragma GCC pop_options
12252+#undef MS_SIMD_SSE
12253+#ifdef __cplusplus
12254+}
12255+#endif
12256+#endif
12257diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
12258new file mode 100644
12259index 00000000..aef5b2a1
12260--- /dev/null
12261+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
12262@@ -0,0 +1,93 @@
12263+/**
12264+ * Copyright 2022 Huawei Technologies Co., Ltd
12265+ *
12266+ * Licensed under the Apache License, Version 2.0 (the "License");
12267+ * you may not use this file except in compliance with the License.
12268+ * You may obtain a copy of the License at
12269+ *
12270+ * http://www.apache.org/licenses/LICENSE-2.0
12271+ *
12272+ * Unless required by applicable law or agreed to in writing, software
12273+ * distributed under the License is distributed on an "AS IS" BASIS,
12274+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12275+ * See the License for the specific language governing permissions and
12276+ * limitations under the License.
12277+ */
12278+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_
12279+#define MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_
12280+
12281+#include "nnacl/intrinsics/ms_simd_instructions.h"
12282+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12283+
12284+#ifdef __cplusplus
12285+extern "C" {
12286+#endif
12287+#pragma GCC push_options
12288+#pragma GCC target("sse4.1")
12289+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12290+#define BLOCK_NUM 4
12291+#define MS_SIMD_SSE
12292+
12293+// act_type must be 0, 1, 3. 0: no_act, 1: relu, 3: relu6.
12294+static inline int64_t GemmIsNotPackSSE(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
12295+  int deep, int act_type) {
12296+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
12297+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
12298+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
12299+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);
12300+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12301+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
12302+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
12303+    if (act_type != 0) {
12304+      dst = SIMD_MAX_F32(dst, down_threshold);
12305+      if (act_type == 3) {
12306+        dst = SIMD_MIN_F32(dst, up_threshold);
12307+      }
12308+    }
12309+    SIMD_ST_F32(c + index, dst);
12310+  }
12311+
12312+  return index;
12313+}
12314+
12315+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)
12316+static inline int64_t GemmIsNotPackOptimizeCoreSSE(int64_t index, const float *a, const float *b, int k, float *dst) {
12317+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
12318+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12319+    SIMD_F32 weight = SIMD_LD_F32(b + index);
12320+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
12321+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
12322+  }
12323+  *dst += SIMD_REDUCE_ADD_F32(dst1);
12324+  return index;
12325+}
12326+#endif
12327+
12328+static inline int64_t MatVecMulNoPackCoreSSE(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
12329+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
12330+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
12331+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
12332+    for (int64_t k = 0; k < depth; ++k) {
12333+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
12334+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
12335+      out = SIMD_FMADD_F32(left, right, out);
12336+    }
12337+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
12338+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
12339+      if (act_type == 0x3) {
12340+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
12341+      }
12342+    }
12343+    SIMD_ST_F32(c + oc_index, out);
12344+  }
12345+  return oc_index;
12346+}
12347+
12348+#undef MS_SIMD_INSTRUCTION
12349+#undef BLOCK_NUM
12350+#pragma GCC pop_options
12351+#undef MS_SIMD_SSE
12352+#ifdef __cplusplus
12353+}
12354+#endif
12355+#endif
12356diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
12357new file mode 100644
12358index 00000000..e3dd4582
12359--- /dev/null
12360+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
12361@@ -0,0 +1,218 @@
12362+/**
12363+ * Copyright 2022 Huawei Technologies Co., Ltd
12364+ *
12365+ * Licensed under the Apache License, Version 2.0 (the "License");
12366+ * you may not use this file except in compliance with the License.
12367+ * You may obtain a copy of the License at
12368+ *
12369+ * http://www.apache.org/licenses/LICENSE-2.0
12370+ *
12371+ * Unless required by applicable law or agreed to in writing, software
12372+ * distributed under the License is distributed on an "AS IS" BASIS,
12373+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12374+ * See the License for the specific language governing permissions and
12375+ * limitations under the License.
12376+ */
12377+#ifndef MINDSPORE_NNACL_FP32_MUL_F32_SSE_H_
12378+#define MINDSPORE_NNACL_FP32_MUL_F32_SSE_H_
12379+
12380+#include "nnacl/intrinsics/ms_simd_instructions.h"
12381+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12382+
12383+#ifdef __cplusplus
12384+extern "C" {
12385+#endif
12386+#pragma GCC push_options
12387+#pragma GCC target("sse4.1")
12388+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12389+#define BLOCK_NUM 4
12390+#define MS_SIMD_SSE
12391+
12392+static inline int ElementMulSSE(int index, const float *in0, const float *in1, float *out, int size) {
12393+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12394+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12395+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12396+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
12397+    SIMD_ST_F32(out + index, vout);
12398+  }
12399+  return index;
12400+}
12401+
12402+static inline int ElementMulReluSSE(int index, const float *in0, const float *in1, float *out, int size) {
12403+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12404+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12405+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12406+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
12407+    SIMD_ST_F32(out + index, vout);
12408+  }
12409+  return index;
12410+}
12411+
12412+static inline int ElementMulRelu6SSE(int index, const float *in0, const float *in1, float *out, int size) {
12413+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12414+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12415+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12416+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
12417+    SIMD_ST_F32(out + index, vout);
12418+  }
12419+  return index;
12420+}
12421+
12422+static inline int ElementMulIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
12423+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12424+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12425+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12426+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
12427+    SIMD_ST_EPI32(out + index, vout);
12428+  }
12429+  return index;
12430+}
12431+
12432+static inline int ElementMulReluIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
12433+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12434+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12435+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12436+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
12437+    SIMD_ST_EPI32(out + index, vout);
12438+  }
12439+  return index;
12440+}
12441+
12442+static inline int ElementMulRelu6IntSSE(int index, const int *in0, const int *in1, int *out, int size) {
12443+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12444+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12445+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12446+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
12447+    SIMD_ST_EPI32(out + index, vout);
12448+  }
12449+  return index;
12450+}
12451+
12452+static inline int ElementOptMulNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
12453+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
12454+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12455+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12456+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
12457+    SIMD_ST_F32(out + index, vout);
12458+  }
12459+  return index;
12460+}
12461+
12462+static inline int ElementOptMulNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
12463+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
12464+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12465+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12466+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
12467+    SIMD_ST_F32(out + index, vout);
12468+  }
12469+  return index;
12470+}
12471+
12472+static inline int ElementOptMulReluNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
12473+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
12474+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12475+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12476+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
12477+    SIMD_ST_F32(out + index, vout);
12478+  }
12479+  return index;
12480+}
12481+
12482+static inline int ElementOptMulReluNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
12483+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
12484+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12485+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12486+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
12487+    SIMD_ST_F32(out + index, vout);
12488+  }
12489+  return index;
12490+}
12491+
12492+static inline int ElementOptMulRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, int size) {
12493+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
12494+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12495+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12496+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
12497+    SIMD_ST_F32(out + index, vout);
12498+  }
12499+  return index;
12500+}
12501+
12502+static inline int ElementOptMulRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, int size) {
12503+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
12504+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12505+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12506+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
12507+    SIMD_ST_F32(out + index, vout);
12508+  }
12509+  return index;
12510+}
12511+
12512+static inline int ElementOptMulIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
12513+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
12514+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12515+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12516+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
12517+    SIMD_ST_EPI32(out + index, vout);
12518+  }
12519+  return index;
12520+}
12521+
12522+static inline int ElementOptMulIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
12523+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
12524+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12525+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12526+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
12527+    SIMD_ST_EPI32(out + index, vout);
12528+  }
12529+  return index;
12530+}
12531+
12532+static inline int ElementOptMulReluIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
12533+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
12534+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12535+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12536+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
12537+    SIMD_ST_EPI32(out + index, vout);
12538+  }
12539+  return index;
12540+}
12541+
12542+static inline int ElementOptMulReluIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
12543+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
12544+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12545+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12546+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
12547+    SIMD_ST_EPI32(out + index, vout);
12548+  }
12549+  return index;
12550+}
12551+
12552+static inline int ElementOptMulRelu6IntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
12553+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
12554+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12555+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12556+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
12557+    SIMD_ST_EPI32(out + index, vout);
12558+  }
12559+  return index;
12560+}
12561+
12562+static inline int ElementOptMulRelu6IntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
12563+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
12564+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12565+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12566+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
12567+    SIMD_ST_EPI32(out + index, vout);
12568+  }
12569+  return index;
12570+}
12571+
12572+#undef MS_SIMD_INSTRUCTION
12573+#undef BLOCK_NUM
12574+#pragma GCC pop_options
12575+#undef MS_SIMD_SSE
12576+#ifdef __cplusplus
12577+}
12578+#endif
12579+#endif
12580diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
12581new file mode 100644
12582index 00000000..ad9239fd
12583--- /dev/null
12584+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
12585@@ -0,0 +1,84 @@
12586+/**
12587+ * Copyright 2022 Huawei Technologies Co., Ltd
12588+ *
12589+ * Licensed under the Apache License, Version 2.0 (the "License");
12590+ * you may not use this file except in compliance with the License.
12591+ * You may obtain a copy of the License at
12592+ *
12593+ * http://www.apache.org/licenses/LICENSE-2.0
12594+ *
12595+ * Unless required by applicable law or agreed to in writing, software
12596+ * distributed under the License is distributed on an "AS IS" BASIS,
12597+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12598+ * See the License for the specific language governing permissions and
12599+ * limitations under the License.
12600+ */
12601+#ifndef MINDSPORE_NNACL_FP32_POOLING_SSE_H_
12602+#define MINDSPORE_NNACL_FP32_POOLING_SSE_H_
12603+
12604+#include "nnacl/intrinsics/ms_simd_instructions.h"
12605+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12606+
12607+#ifdef __cplusplus
12608+extern "C" {
12609+#endif
12610+#pragma GCC push_options
12611+#pragma GCC target("sse4.1")
12612+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12613+#define BLOCK_NUM 4
12614+#define MS_SIMD_SSE
12615+
12616+static inline int AvgPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel,
12617+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
12618+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
12619+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
12620+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
12621+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
12622+    const float *src_c_ptr = src_plane_ptr + ci;
12623+    float *dst_c_ptr = dst_plane_ptr + ci;
12624+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
12625+    int real_count = 0;
12626+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
12627+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
12628+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
12629+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
12630+        ++real_count;
12631+      }
12632+    }
12633+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
12634+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
12635+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
12636+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
12637+  }
12638+  return ci;
12639+}
12640+
12641+static inline int MaxPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel,
12642+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
12643+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
12644+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
12645+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
12646+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
12647+    const float *src_c_ptr = src_plane_ptr + ci;
12648+    float *dst_c_ptr = dst_plane_ptr + ci;
12649+    SIMD_F32 tmp_max = min_val;
12650+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
12651+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
12652+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
12653+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
12654+      }
12655+    }
12656+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
12657+    SIMD_ST_F32(dst_c_ptr, tmp_max);
12658+  }
12659+  return ci;
12660+}
12661+
12662+#undef MS_SIMD_INSTRUCTION
12663+#undef BLOCK_NUM
12664+#pragma GCC pop_options
12665+#undef MS_SIMD_SSE
12666+#ifdef __cplusplus
12667+}
12668+#endif
12669+#endif
12670diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
12671new file mode 100644
12672index 00000000..4c46310e
12673--- /dev/null
12674+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
12675@@ -0,0 +1,101 @@
12676+/**
12677+ * Copyright 2022 Huawei Technologies Co., Ltd
12678+ *
12679+ * Licensed under the Apache License, Version 2.0 (the "License");
12680+ * you may not use this file except in compliance with the License.
12681+ * You may obtain a copy of the License at
12682+ *
12683+ * http://www.apache.org/licenses/LICENSE-2.0
12684+ *
12685+ * Unless required by applicable law or agreed to in writing, software
12686+ * distributed under the License is distributed on an "AS IS" BASIS,
12687+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12688+ * See the License for the specific language governing permissions and
12689+ * limitations under the License.
12690+ */
12691+#ifndef MINDSPORE_NNACL_FP32_POWER_SSE_H_
12692+#define MINDSPORE_NNACL_FP32_POWER_SSE_H_
12693+
12694+#include "nnacl/intrinsics/ms_simd_instructions.h"
12695+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12696+
12697+#ifdef __cplusplus
12698+extern "C" {
12699+#endif
12700+#pragma GCC push_options
12701+#pragma GCC target("sse4.1")
12702+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12703+#define BLOCK_NUM 4
12704+#define MS_SIMD_SSE
12705+
12706+static inline int PowerBroadCastIntExponentSSE(int index, const float *input, int exponent, float *output, int len,
12707+  float scale, float shift) {
12708+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
12709+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
12710+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12711+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
12712+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
12713+    int exp = abs(exponent);
12714+    while (exp) {
12715+      if (exp % 2) {
12716+        result = SIMD_MUL_F32(result, tmp);
12717+      }
12718+      tmp = SIMD_MUL_SQUARE_F32(tmp);
12719+      exp = exp / 2;
12720+    }
12721+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
12722+  }
12723+  return index;
12724+}
12725+
12726+static inline int PowerBroadCastFloatExponentSSE(int index, const float *input, float exponent, float *output, int len,
12727+  float scale, float shift) {
12728+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
12729+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
12730+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12731+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
12732+    SIMD_F32 result;
12733+    for (int i = 0; i < BLOCK_NUM; ++i) {
12734+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
12735+    }
12736+    SIMD_ST_F32(output + index, result);
12737+  }
12738+  return index;
12739+}
12740+
12741+static inline int PowerSingleExponentSSE(int index, const float *input, const float *exponent, float *output, int len,
12742+  float scale, float shift) {
12743+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
12744+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
12745+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12746+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
12747+    for (int j = 0; j < BLOCK_NUM; ++j) {
12748+      float cur_exponent = exponent[index + j];
12749+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
12750+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
12751+        int exp = abs((int)(cur_exponent));
12752+        float result = 1;
12753+        while (exp) {
12754+          if (exp % 2) {
12755+            result *= cur_val;
12756+          }
12757+          cur_val *= cur_val;
12758+          exp = exp / 2;
12759+        }
12760+        output[index + j] = cur_exponent >= 0 ? result : 1 / result;
12761+      } else {
12762+        output[index + j] = powf(cur_val, cur_exponent);
12763+      }
12764+    }
12765+  }
12766+  return index;
12767+}
12768+
12769+#undef MS_SIMD_INSTRUCTION
12770+#undef BLOCK_NUM
12771+#pragma GCC pop_options
12772+#undef MS_SIMD_SSE
12773+#ifdef __cplusplus
12774+}
12775+#endif
12776+#endif
12777diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
12778new file mode 100644
12779index 00000000..936a5d51
12780--- /dev/null
12781+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
12782@@ -0,0 +1,181 @@
12783+/**
12784+ * Copyright 2022 Huawei Technologies Co., Ltd
12785+ *
12786+ * Licensed under the Apache License, Version 2.0 (the "License");
12787+ * you may not use this file except in compliance with the License.
12788+ * You may obtain a copy of the License at
12789+ *
12790+ * http://www.apache.org/licenses/LICENSE-2.0
12791+ *
12792+ * Unless required by applicable law or agreed to in writing, software
12793+ * distributed under the License is distributed on an "AS IS" BASIS,
12794+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12795+ * See the License for the specific language governing permissions and
12796+ * limitations under the License.
12797+ */
12798+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_
12799+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_
12800+
12801+#include "nnacl/intrinsics/ms_simd_instructions.h"
12802+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12803+
12804+#ifdef __cplusplus
12805+extern "C" {
12806+#endif
12807+#pragma GCC push_options
12808+#pragma GCC target("sse4.1")
12809+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12810+#define BLOCK_NUM 4
12811+#define MS_SIMD_SSE
12812+
12813+static inline int64_t ReduceSumSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12814+  int axis_size) {  // sum along the reduced axis for BLOCK_NUM inner lanes at a time; returns first unprocessed index
12815+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12816+    const float *inner_src = outer_src + index;
12817+    SIMD_F32 tmp = SIMD_MOV_F32(0);  // additive identity
12818+    for (int i = 0; i < axis_size; i++) {
12819+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));  // stride inner_size walks the reduced axis
12820+    }
12821+    SIMD_ST_F32(outer_dst + index, tmp);
12822+  }
12823+  return index;
12824+}
12825+
12826+static inline int64_t ReduceMeanSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12827+  int axis_size) {  // mean along the reduced axis: vector sum then divide by axis_size
12828+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12829+    const float *inner_src = outer_src + index;
12830+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12831+    for (int i = 0; i < axis_size; i++) {
12832+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12833+    }
12834+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));  // assumes axis_size > 0 — caller's contract
12835+  }
12836+  return index;
12837+}
12838+
12839+static inline int64_t ReduceMinSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12840+  int axis_size) {  // min along the reduced axis for BLOCK_NUM inner lanes at a time
12841+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12842+    const float *inner_src = outer_src + index;
12843+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);  // FLT_MAX is the correct identity for a min-reduce
12844+    for (int i = 0; i < axis_size; i++) {
12845+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12846+    }
12847+    SIMD_ST_F32(outer_dst + index, tmp);
12848+  }
12849+  return index;
12850+}
12851+
12852+static inline int64_t ReduceMaxSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12853+  int axis_size) {  // max along the reduced axis for BLOCK_NUM inner lanes at a time; returns first unprocessed index
12854+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12855+    const float *inner_src = outer_src + index;
12856+    SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX);  // fix: FLT_MIN is the smallest POSITIVE float and corrupts all-negative inputs; max identity is -FLT_MAX
12857+    for (int i = 0; i < axis_size; i++) {
12858+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12859+    }
12860+    SIMD_ST_F32(outer_dst + index, tmp);
12861+  }
12862+  return index;
12863+}
12864+
12865+static inline int64_t ReduceProdSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12866+  int axis_size) {  // product along the reduced axis for BLOCK_NUM inner lanes at a time
12867+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12868+    const float *inner_src = outer_src + index;
12869+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);  // multiplicative identity
12870+    for (int i = 0; i < axis_size; i++) {
12871+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12872+    }
12873+    SIMD_ST_F32(outer_dst + index, tmp);
12874+  }
12875+  return index;
12876+}
12877+
12878+static inline int64_t ReduceSumSquareSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12879+  int axis_size) {  // sum of squared elements along the reduced axis
12880+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12881+    const float *inner_src = outer_src + index;
12882+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12883+    for (int i = 0; i < axis_size; i++) {
12884+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
12885+    }
12886+    SIMD_ST_F32(outer_dst + index, tmp);
12887+  }
12888+  return index;
12889+}
12890+
12891+static inline int64_t ReduceL2NormSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12892+  int axis_size) {  // L2 norm along the reduced axis: sqrt(sum of squares)
12893+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12894+    const float *inner_src = outer_src + index;
12895+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12896+    for (int i = 0; i < axis_size; i++) {
12897+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
12898+    }
12899+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));  // only this final sqrt differs from ReduceSumSquareSSE
12900+  }
12901+  return index;
12902+}
12903+
12904+static inline int64_t IntReduceSumSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12905+  int axis_size) {  // int32 sum along the reduced axis; no overflow guard (matches scalar path)
12906+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12907+    const int *inner_src = outer_src + index;
12908+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
12909+    for (int i = 0; i < axis_size; i++) {
12910+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12911+    }
12912+    SIMD_ST_EPI32(outer_dst + index, tmp);
12913+  }
12914+  return index;
12915+}
12916+
12917+static inline int64_t IntReduceMeanSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12918+  int axis_size) {  // int32 mean along the reduced axis (integer division truncates)
12919+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12920+    const int *inner_src = outer_src + index;
12921+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
12922+    for (int i = 0; i < axis_size; i++) {
12923+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12924+    }
12925+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));  // assumes axis_size > 0 — caller's contract
12926+  }
12927+  return index;
12928+}
12929+
12930+static inline int64_t IntReduceMinSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12931+  int axis_size) {  // int32 min along the reduced axis
12932+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12933+    const int *inner_src = outer_src + index;
12934+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);  // correct identity for a min-reduce
12935+    for (int i = 0; i < axis_size; i++) {
12936+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12937+    }
12938+    SIMD_ST_EPI32(outer_dst + index, tmp);
12939+  }
12940+  return index;
12941+}
12942+
12943+static inline int64_t IntReduceMaxSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12944+  int axis_size) {  // int32 max along the reduced axis
12945+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12946+    const int *inner_src = outer_src + index;
12947+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);  // correct identity for a max-reduce (contrast the float ReduceMaxSSE)
12948+    for (int i = 0; i < axis_size; i++) {
12949+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12950+    }
12951+    SIMD_ST_EPI32(outer_dst + index, tmp);
12952+  }
12953+  return index;
12954+}
12955+
12956+#undef MS_SIMD_INSTRUCTION
12957+#undef BLOCK_NUM
12958+#pragma GCC pop_options
12959+#undef MS_SIMD_SSE
12960+#ifdef __cplusplus
12961+}
12962+#endif
12963+#endif
12964diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
12965new file mode 100644
12966index 00000000..71c89ebc
12967--- /dev/null
12968+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
12969@@ -0,0 +1,87 @@
12970+/**
12971+ * Copyright 2022 Huawei Technologies Co., Ltd
12972+ *
12973+ * Licensed under the Apache License, Version 2.0 (the "License");
12974+ * you may not use this file except in compliance with the License.
12975+ * You may obtain a copy of the License at
12976+ *
12977+ * http://www.apache.org/licenses/LICENSE-2.0
12978+ *
12979+ * Unless required by applicable law or agreed to in writing, software
12980+ * distributed under the License is distributed on an "AS IS" BASIS,
12981+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12982+ * See the License for the specific language governing permissions and
12983+ * limitations under the License.
12984+ */
12985+
12986+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_
12987+#define MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_
12988+
12989+#include "nnacl/intrinsics/ms_simd_instructions.h"
12990+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12991+
12992+#ifdef __cplusplus
12993+extern "C" {
12994+#endif
12995+#pragma GCC push_options
12996+#pragma GCC target("sse4.1")
12997+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12998+#define BLOCK_NUM 4
12999+#define MS_SIMD_SSE
13000+
13001+static inline int64_t SoftmaxNormGetMaxSSE(int64_t index, const float *src, int cur_batch_offset,
13002+  float *max, int channel) {  // folds the channel max into *max; returns first unprocessed index
13003+  if (channel >= BLOCK_NUM * BLOCK_NUM) {  // vectorize only for >= 16 channels; otherwise leave all work to the scalar path
13004+    SIMD_F32 max_val = SIMD_MOV_F32(*max);  // seed from caller's running max so partial results combine correctly
13005+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13006+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
13007+    }
13008+    *max = SIMD_GET_MAX_F32(max_val);  // horizontal max across lanes
13009+  }
13010+  return index;
13011+}
13012+
13013+static inline int64_t SoftmaxNormCalcNormSSE(int64_t index, const float *src, float *dst,
13014+  int cur_batch_offset, float max, int channel) {  // dst[i] = src[i] - max (softmax numerical-stability shift)
13015+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13016+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
13017+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
13018+  }
13019+  return index;
13020+}
13021+
13022+static inline int64_t SoftmaxLastAxisGetExpSumSSE(int64_t index, const float *src, float *dst,
13023+  int cur_batch_offset, float max, float *exp_sum, int channel) {  // dst[i] = exp(src[i] - max); accumulates sum of exps into *exp_sum
13024+#ifndef _WIN32
13025+  SIMD_F32 sum_val = SIMD_SET0_F32;
13026+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13027+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
13028+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
13029+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
13030+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
13031+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
13032+  }
13033+  *exp_sum += SIMD_GET_SUM_F32(sum_val);  // horizontal add across lanes, folded into the caller's running sum
13034+#endif
13035+  return index;
13036+}
13037+
13038+static inline int64_t SoftmaxLastAxisGetResultSSE(int64_t index, const float *src, float *dst,
13039+  int cur_batch_offset, float exp_sum, int channel) {  // dst[i] = src[i] * exp_sum; NOTE(review): multiplies, so exp_sum is presumably the caller-precomputed 1/sum — confirm against caller
13040+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
13041+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13042+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
13043+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
13044+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
13045+  }
13046+  return index;
13047+}
13048+
13049+#undef MS_SIMD_INSTRUCTION
13050+#undef BLOCK_NUM
13051+#pragma GCC pop_options
13052+#undef MS_SIMD_SSE
13053+#ifdef __cplusplus
13054+};
13055+#endif
13056+#endif
13057diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
13058new file mode 100644
13059index 00000000..a6197e19
13060--- /dev/null
13061+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
13062@@ -0,0 +1,167 @@
13063+/**
13064+ * Copyright 2022 Huawei Technologies Co., Ltd
13065+ *
13066+ * Licensed under the Apache License, Version 2.0 (the "License");
13067+ * you may not use this file except in compliance with the License.
13068+ * You may obtain a copy of the License at
13069+ *
13070+ * http://www.apache.org/licenses/LICENSE-2.0
13071+ *
13072+ * Unless required by applicable law or agreed to in writing, software
13073+ * distributed under the License is distributed on an "AS IS" BASIS,
13074+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13075+ * See the License for the specific language governing permissions and
13076+ * limitations under the License.
13077+ */
13078+
13079+#ifndef MINDSPORE_NNACL_FP32_SUB_SSE_H_
13080+#define MINDSPORE_NNACL_FP32_SUB_SSE_H_
13081+
13082+#include "nnacl/intrinsics/ms_simd_instructions.h"
13083+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
13084+
13085+#ifdef __cplusplus
13086+extern "C" {
13087+#endif
13088+#pragma GCC push_options
13089+#pragma GCC target("sse4.1")
13090+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
13091+#define BLOCK_NUM 4
13092+#define MS_SIMD_SSE
13093+
13094+static inline int ElementOptSubNum0SSE(int index, const float *in0, const float *in1, float *out,
13095+                                                      int size) {  // out[i] = in0[0] - in1[i] (scalar lhs broadcast)
13096+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
13097+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13098+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13099+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
13100+    SIMD_ST_F32(out + index, vout);
13101+  }
13102+  return index;
13103+}
13104+
13105+static inline int ElementOptSubNum1SSE(int index, const float *in0, const float *in1, float *out,
13106+                                                      int size) {  // out[i] = in0[i] - in1[0] (scalar rhs broadcast)
13107+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
13108+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13109+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13110+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
13111+    SIMD_ST_F32(out + index, vout);
13112+  }
13113+  return index;
13114+}
13115+
13116+static inline int ElementOptSubIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {  // int32: out[i] = in0[0] - in1[i]
13117+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
13118+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13119+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
13120+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
13121+    SIMD_ST_EPI32(out + index, vout);
13122+  }
13123+  return index;
13124+}
13125+
13126+static inline int ElementOptSubIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {  // int32: out[i] = in0[i] - in1[0]
13127+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
13128+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13129+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
13130+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
13131+    SIMD_ST_EPI32(out + index, vout);
13132+  }
13133+  return index;
13134+}
13135+
13136+static inline int ElementOptSubReluNum0SSE(int index, const float *in0, const float *in1, float *out,
13137+                                                          int size) {  // out[i] = max(in0[0] - in1[i], 0) — fused sub + ReLU
13138+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
13139+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13140+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13141+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
13142+    SIMD_ST_F32(out + index, vout);
13143+  }
13144+  return index;
13145+}
13146+
13147+static inline int ElementOptSubReluNum1SSE(int index, const float *in0, const float *in1, float *out,
13148+                                                          int size) {  // out[i] = max(in0[i] - in1[0], 0) — fused sub + ReLU
13149+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
13150+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13151+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13152+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
13153+    SIMD_ST_F32(out + index, vout);
13154+  }
13155+  return index;
13156+}
13157+
13158+static inline int ElementOptSubRelu6Num0SSE(int index, const float *in0, const float *in1, float *out,
13159+                                                           int size) {  // out[i] = clamp(in0[0] - in1[i], 0, 6) — fused sub + ReLU6
13160+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
13161+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13162+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13163+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
13164+    SIMD_ST_F32(out + index, vout);
13165+  }
13166+  return index;
13167+}
13168+
13169+static inline int ElementOptSubRelu6Num1SSE(int index, const float *in0, const float *in1, float *out,
13170+                                                           int size) {  // out[i] = clamp(in0[i] - in1[0], 0, 6) — fused sub + ReLU6
13171+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
13172+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13173+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13174+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
13175+    SIMD_ST_F32(out + index, vout);
13176+  }
13177+  return index;
13178+}
13179+
13180+static inline int ElementSubSSE(int index, const float *in0, const float *in1, float *out, int size) {  // out[i] = in0[i] - in1[i], elementwise
13181+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13182+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13183+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13184+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
13185+    SIMD_ST_F32(out + index, vout);
13186+  }
13187+  return index;
13188+}
13189+
13190+static inline int ElementSubIntSSE(int index, const int *in0, const int *in1, int *out, int size) {  // int32: out[i] = in0[i] - in1[i], elementwise
13191+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13192+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
13193+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
13194+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
13195+    SIMD_ST_EPI32(out + index, vout);
13196+  }
13197+  return index;
13198+}
13199+
13200+static inline int ElementSubReluSSE(int index, const float *in0, const float *in1, float *out,
13201+                                                   int size) {  // out[i] = max(in0[i] - in1[i], 0) — fused sub + ReLU
13202+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13203+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13204+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13205+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
13206+    SIMD_ST_F32(out + index, vout);
13207+  }
13208+  return index;
13209+}
13210+
13211+static inline int ElementSubRelu6SSE(int index, const float *in0, const float *in1, float *out,
13212+                                                    int size) {  // out[i] = clamp(in0[i] - in1[i], 0, 6) — fused sub + ReLU6
13213+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13214+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13215+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13216+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
13217+    SIMD_ST_F32(out + index, vout);
13218+  }
13219+  return index;
13220+}
13221+
13222+#undef MS_SIMD_INSTRUCTION
13223+#undef BLOCK_NUM
13224+#pragma GCC pop_options
13225+#undef MS_SIMD_SSE
13226+#ifdef __cplusplus
13227+};
13228+#endif
13229+#endif
13230diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h
13231new file mode 100644
13232index 00000000..894f5d7c
13233--- /dev/null
13234+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h
13235@@ -0,0 +1,36 @@
13236+/**
13237+ * Copyright 2022 Huawei Technologies Co., Ltd
13238+ *
13239+ * Licensed under the Apache License, Version 2.0 (the "License");
13240+ * you may not use this file except in compliance with the License.
13241+ * You may obtain a copy of the License at
13242+ *
13243+ * http://www.apache.org/licenses/LICENSE-2.0
13244+ *
13245+ * Unless required by applicable law or agreed to in writing, software
13246+ * distributed under the License is distributed on an "AS IS" BASIS,
13247+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13248+ * See the License for the specific language governing permissions and
13249+ * limitations under the License.
13250+ */
13251+#ifndef MINDSPORE_NNACL_SUB_FP32_SIMD_H_
13252+#define MINDSPORE_NNACL_SUB_FP32_SIMD_H_
13253+
13254+#include "nnacl/intrinsics/ms_simd_instructions.h"
13255+#ifdef ENABLE_AVX512
13256+#include "nnacl/avx512/sub_fp32_avx512.h"
13257+#endif
13258+
13259+#ifdef ENABLE_AVX
13260+#include "nnacl/avx/sub_fp32_avx.h"
13261+#endif
13262+
13263+#ifdef ENABLE_SSE
13264+#include "nnacl/sse/sub_fp32_sse.h"
13265+#endif
13266+
13267+#ifdef ENABLE_ARM
13268+#include "nnacl/neon/sub_fp32_neon.h"
13269+#endif
13270+
13271+#endif
13272--
132732.34.1
13274
13275