• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1From e6e35ad9f7b4c0c99d2f9b62c7d199dd3bf487dc Mon Sep 17 00:00:00 2001
2From: Zhu Guodong <zhuguodong0001@163.com>
3Date: Mon, 6 Mar 2023 16:02:57 +0800
4Subject: [PATCH 2/4] generate nnacl simd headers manually
5
6---
7 .../include/nnacl/activation_fp32_simd.h      |  36 +++
8 .../include/nnacl/activation_grad_simd.h      |  36 +++
9 .../nnacl/include/nnacl/adam_fp32_simd.h      |  36 +++
10 .../nnacl/include/nnacl/add_fp32_simd.h       |  36 +++
11 .../include/nnacl/arithmetic_fp32_simd.h      |  36 +++
12 .../include/nnacl/arithmetic_self_fp32_simd.h |  36 +++
13 .../include/nnacl/avx/activation_fp32_avx.h   | 221 +++++++++++++++
14 .../include/nnacl/avx/activation_grad_avx.h   |  57 ++++
15 .../nnacl/include/nnacl/avx/adam_fp32_avx.h   | 210 +++++++++++++++
16 .../nnacl/include/nnacl/avx/add_fp32_avx.h    | 124 +++++++++
17 .../include/nnacl/avx/arithmetic_fp32_avx.h   | 254 ++++++++++++++++++
18 .../nnacl/avx/arithmetic_self_fp32_avx.h      | 129 +++++++++
19 .../include/nnacl/avx/batchnorm_fp32_avx.h    |  67 +++++
20 .../nnacl/avx/bce_with_logits_loss_fp32_avx.h |  69 +++++
21 .../nnacl/include/nnacl/avx/bias_add_avx.h    |  64 +++++
22 .../nnacl/include/nnacl/avx/cast_base_avx.h   |  56 ++++
23 .../nnacl/include/nnacl/avx/cdist_fp32_avx.h  |  70 +++++
24 .../nnacl/include/nnacl/avx/cumsum_fp32_avx.h | 121 +++++++++
25 .../nnacl/include/nnacl/avx/div_fp32_avx.h    | 167 ++++++++++++
26 .../include/nnacl/avx/dropout_fp32_avx.h      |  46 ++++
27 .../nnacl/include/nnacl/avx/exp_fp32_avx.h    |  63 +++++
28 .../nnacl/include/nnacl/avx/fill_base_avx.h   |  53 ++++
29 .../include/nnacl/avx/group_norm_fp32_avx.h   |  77 ++++++
30 .../include/nnacl/avx/layer_norm_fp32_avx.h   |  68 +++++
31 .../nnacl/include/nnacl/avx/matmul_fp32_avx.h |  93 +++++++
32 .../nnacl/include/nnacl/avx/mul_fp32_avx.h    | 218 +++++++++++++++
33 .../include/nnacl/avx/pooling_fp32_avx.h      |  84 ++++++
34 .../nnacl/include/nnacl/avx/power_fp32_avx.h  | 101 +++++++
35 .../nnacl/include/nnacl/avx/reduce_fp32_avx.h | 181 +++++++++++++
36 .../include/nnacl/avx/softmax_fp32_avx.h      |  87 ++++++
37 .../nnacl/include/nnacl/avx/sub_fp32_avx.h    | 167 ++++++++++++
38 .../nnacl/avx512/activation_fp32_avx512.h     | 221 +++++++++++++++
39 .../nnacl/avx512/activation_grad_avx512.h     |  57 ++++
40 .../include/nnacl/avx512/adam_fp32_avx512.h   | 210 +++++++++++++++
41 .../include/nnacl/avx512/add_fp32_avx512.h    | 124 +++++++++
42 .../nnacl/avx512/arithmetic_fp32_avx512.h     | 254 ++++++++++++++++++
43 .../avx512/arithmetic_self_fp32_avx512.h      | 129 +++++++++
44 .../nnacl/avx512/batchnorm_fp32_avx512.h      |  67 +++++
45 .../avx512/bce_with_logits_loss_fp32_avx512.h |  69 +++++
46 .../include/nnacl/avx512/bias_add_avx512.h    |  64 +++++
47 .../include/nnacl/avx512/cast_base_avx512.h   |  56 ++++
48 .../include/nnacl/avx512/cdist_fp32_avx512.h  |  70 +++++
49 .../include/nnacl/avx512/cumsum_fp32_avx512.h | 121 +++++++++
50 .../include/nnacl/avx512/div_fp32_avx512.h    | 167 ++++++++++++
51 .../nnacl/avx512/dropout_fp32_avx512.h        |  46 ++++
52 .../include/nnacl/avx512/exp_fp32_avx512.h    |  63 +++++
53 .../include/nnacl/avx512/fill_base_avx512.h   |  53 ++++
54 .../nnacl/avx512/group_norm_fp32_avx512.h     |  77 ++++++
55 .../nnacl/avx512/layer_norm_fp32_avx512.h     |  68 +++++
56 .../include/nnacl/avx512/matmul_fp32_avx512.h |  93 +++++++
57 .../include/nnacl/avx512/mul_fp32_avx512.h    | 218 +++++++++++++++
58 .../nnacl/avx512/pooling_fp32_avx512.h        |  84 ++++++
59 .../include/nnacl/avx512/power_fp32_avx512.h  | 101 +++++++
60 .../include/nnacl/avx512/reduce_fp32_avx512.h | 181 +++++++++++++
61 .../nnacl/avx512/softmax_fp32_avx512.h        |  87 ++++++
62 .../include/nnacl/avx512/sub_fp32_avx512.h    | 167 ++++++++++++
63 .../nnacl/include/nnacl/batchnorm_fp32_simd.h |  36 +++
64 .../nnacl/bce_with_logits_loss_fp32_simd.h    |  36 +++
65 .../nnacl/include/nnacl/bias_add_simd.h       |  36 +++
66 .../nnacl/include/nnacl/cast_base_simd.h      |  36 +++
67 .../nnacl/include/nnacl/cdist_fp32_simd.h     |  36 +++
68 .../nnacl/include/nnacl/cumsum_fp32_simd.h    |  36 +++
69 .../nnacl/include/nnacl/div_fp32_simd.h       |  36 +++
70 .../nnacl/include/nnacl/dropout_fp32_simd.h   |  36 +++
71 .../nnacl/include/nnacl/exp_fp32_simd.h       |  36 +++
72 .../nnacl/include/nnacl/fill_base_simd.h      |  36 +++
73 .../include/nnacl/group_norm_fp32_simd.h      |  36 +++
74 .../include/nnacl/layer_norm_fp32_simd.h      |  36 +++
75 .../nnacl/include/nnacl/matmul_fp32_simd.h    |  36 +++
76 .../nnacl/include/nnacl/mul_fp32_simd.h       |  36 +++
77 .../include/nnacl/neon/activation_fp32_neon.h | 220 +++++++++++++++
78 .../include/nnacl/neon/activation_grad_neon.h |  56 ++++
79 .../nnacl/include/nnacl/neon/adam_fp32_neon.h | 209 ++++++++++++++
80 .../nnacl/include/nnacl/neon/add_fp32_neon.h  | 123 +++++++++
81 .../include/nnacl/neon/arithmetic_fp32_neon.h | 253 +++++++++++++++++
82 .../nnacl/neon/arithmetic_self_fp32_neon.h    | 128 +++++++++
83 .../include/nnacl/neon/batchnorm_fp32_neon.h  |  66 +++++
84 .../neon/bce_with_logits_loss_fp32_neon.h     |  68 +++++
85 .../nnacl/include/nnacl/neon/bias_add_neon.h  |  63 +++++
86 .../nnacl/include/nnacl/neon/cast_base_neon.h |  55 ++++
87 .../include/nnacl/neon/cdist_fp32_neon.h      |  69 +++++
88 .../include/nnacl/neon/cumsum_fp32_neon.h     | 120 +++++++++
89 .../nnacl/include/nnacl/neon/div_fp32_neon.h  | 166 ++++++++++++
90 .../include/nnacl/neon/dropout_fp32_neon.h    |  45 ++++
91 .../nnacl/include/nnacl/neon/exp_fp32_neon.h  |  62 +++++
92 .../nnacl/include/nnacl/neon/fill_base_neon.h |  52 ++++
93 .../include/nnacl/neon/group_norm_fp32_neon.h |  76 ++++++
94 .../include/nnacl/neon/layer_norm_fp32_neon.h |  67 +++++
95 .../include/nnacl/neon/matmul_fp32_neon.h     |  92 +++++++
96 .../nnacl/include/nnacl/neon/mul_fp32_neon.h  | 217 +++++++++++++++
97 .../include/nnacl/neon/pooling_fp32_neon.h    |  83 ++++++
98 .../include/nnacl/neon/power_fp32_neon.h      | 100 +++++++
99 .../include/nnacl/neon/reduce_fp32_neon.h     | 180 +++++++++++++
100 .../include/nnacl/neon/softmax_fp32_neon.h    |  86 ++++++
101 .../nnacl/include/nnacl/neon/sub_fp32_neon.h  | 166 ++++++++++++
102 .../nnacl/include/nnacl/pooling_fp32_simd.h   |  36 +++
103 .../nnacl/include/nnacl/power_fp32_simd.h     |  36 +++
104 .../nnacl/include/nnacl/reduce_fp32_simd.h    |  36 +++
105 .../nnacl/include/nnacl/softmax_fp32_simd.h   |  36 +++
106 .../include/nnacl/sse/activation_fp32_sse.h   | 221 +++++++++++++++
107 .../include/nnacl/sse/activation_grad_sse.h   |  57 ++++
108 .../nnacl/include/nnacl/sse/adam_fp32_sse.h   | 210 +++++++++++++++
109 .../nnacl/include/nnacl/sse/add_fp32_sse.h    | 124 +++++++++
110 .../include/nnacl/sse/arithmetic_fp32_sse.h   | 254 ++++++++++++++++++
111 .../nnacl/sse/arithmetic_self_fp32_sse.h      | 129 +++++++++
112 .../include/nnacl/sse/batchnorm_fp32_sse.h    |  67 +++++
113 .../nnacl/sse/bce_with_logits_loss_fp32_sse.h |  69 +++++
114 .../nnacl/include/nnacl/sse/bias_add_sse.h    |  64 +++++
115 .../nnacl/include/nnacl/sse/cast_base_sse.h   |  56 ++++
116 .../nnacl/include/nnacl/sse/cdist_fp32_sse.h  |  70 +++++
117 .../nnacl/include/nnacl/sse/cumsum_fp32_sse.h | 121 +++++++++
118 .../nnacl/include/nnacl/sse/div_fp32_sse.h    | 167 ++++++++++++
119 .../include/nnacl/sse/dropout_fp32_sse.h      |  46 ++++
120 .../nnacl/include/nnacl/sse/exp_fp32_sse.h    |  63 +++++
121 .../nnacl/include/nnacl/sse/fill_base_sse.h   |  53 ++++
122 .../include/nnacl/sse/group_norm_fp32_sse.h   |  77 ++++++
123 .../include/nnacl/sse/layer_norm_fp32_sse.h   |  68 +++++
124 .../nnacl/include/nnacl/sse/matmul_fp32_sse.h |  93 +++++++
125 .../nnacl/include/nnacl/sse/mul_fp32_sse.h    | 218 +++++++++++++++
126 .../include/nnacl/sse/pooling_fp32_sse.h      |  84 ++++++
127 .../nnacl/include/nnacl/sse/power_fp32_sse.h  | 101 +++++++
128 .../nnacl/include/nnacl/sse/reduce_fp32_sse.h | 181 +++++++++++++
129 .../include/nnacl/sse/softmax_fp32_sse.h      |  87 ++++++
130 .../nnacl/include/nnacl/sse/sub_fp32_sse.h    | 167 ++++++++++++
131 .../nnacl/include/nnacl/sub_fp32_simd.h       |  36 +++
132 125 files changed, 12263 insertions(+)
133 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
134 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
135 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
136 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
137 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
138 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
139 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
140 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
141 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
142 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
143 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
144 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
145 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
146 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
147 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
148 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
149 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
150 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
151 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
152 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
153 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
154 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
155 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
156 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
157 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
158 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
159 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
160 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
161 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
162 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
163 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
164 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
165 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
166 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
167 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
168 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
169 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
170 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
171 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
172 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
173 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
174 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
175 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
176 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
177 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
178 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
179 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
180 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
181 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
182 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
183 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
184 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
185 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
186 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
187 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
188 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
189 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
190 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
191 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
192 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
193 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
194 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
195 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
196 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
197 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
198 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
199 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
200 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
201 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
202 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
203 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
204 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
205 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
206 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
207 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
208 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
209 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
210 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
211 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
212 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
213 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
214 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
215 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
216 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
217 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
218 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
219 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
220 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
221 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
222 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
223 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
224 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
225 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
226 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
227 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
228 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
229 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
230 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
231 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
232 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
233 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
234 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
235 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
236 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
237 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
238 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
239 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
240 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
241 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
242 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
243 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
244 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
245 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
246 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
247 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
248 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
249 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
250 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
251 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
252 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
253 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
254 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
255 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
256 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
257 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h
258
259diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
260new file mode 100644
261index 00000000..fead4fd3
262--- /dev/null
263+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
264@@ -0,0 +1,36 @@
265+/**
266+ * Copyright 2022 Huawei Technologies Co., Ltd
267+ *
268+ * Licensed under the Apache License, Version 2.0 (the "License");
269+ * you may not use this file except in compliance with the License.
270+ * You may obtain a copy of the License at
271+ *
272+ * http://www.apache.org/licenses/LICENSE-2.0
273+ *
274+ * Unless required by applicable law or agreed to in writing, software
275+ * distributed under the License is distributed on an "AS IS" BASIS,
276+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
277+ * See the License for the specific language governing permissions and
278+ * limitations under the License.
279+ */
280+#ifndef MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_
281+#define MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_
282+
283+#include "nnacl/intrinsics/ms_simd_instructions.h"
284+#ifdef ENABLE_AVX512
285+#include "nnacl/avx512/activation_fp32_avx512.h"
286+#endif
287+
288+#ifdef ENABLE_AVX
289+#include "nnacl/avx/activation_fp32_avx.h"
290+#endif
291+
292+#ifdef ENABLE_SSE
293+#include "nnacl/sse/activation_fp32_sse.h"
294+#endif
295+
296+#ifdef ENABLE_ARM
297+#include "nnacl/neon/activation_fp32_neon.h"
298+#endif
299+
300+#endif
301diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
302new file mode 100644
303index 00000000..c8637379
304--- /dev/null
305+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
306@@ -0,0 +1,36 @@
307+/**
308+ * Copyright 2022 Huawei Technologies Co., Ltd
309+ *
310+ * Licensed under the Apache License, Version 2.0 (the "License");
311+ * you may not use this file except in compliance with the License.
312+ * You may obtain a copy of the License at
313+ *
314+ * http://www.apache.org/licenses/LICENSE-2.0
315+ *
316+ * Unless required by applicable law or agreed to in writing, software
317+ * distributed under the License is distributed on an "AS IS" BASIS,
318+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
319+ * See the License for the specific language governing permissions and
320+ * limitations under the License.
321+ */
322+#ifndef MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_
323+#define MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_
324+
325+#include "nnacl/intrinsics/ms_simd_instructions.h"
326+#ifdef ENABLE_AVX512
327+#include "nnacl/avx512/activation_grad_avx512.h"
328+#endif
329+
330+#ifdef ENABLE_AVX
331+#include "nnacl/avx/activation_grad_avx.h"
332+#endif
333+
334+#ifdef ENABLE_SSE
335+#include "nnacl/sse/activation_grad_sse.h"
336+#endif
337+
338+#ifdef ENABLE_ARM
339+#include "nnacl/neon/activation_grad_neon.h"
340+#endif
341+
342+#endif
343diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
344new file mode 100644
345index 00000000..267799ed
346--- /dev/null
347+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
348@@ -0,0 +1,36 @@
349+/**
350+ * Copyright 2022 Huawei Technologies Co., Ltd
351+ *
352+ * Licensed under the Apache License, Version 2.0 (the "License");
353+ * you may not use this file except in compliance with the License.
354+ * You may obtain a copy of the License at
355+ *
356+ * http://www.apache.org/licenses/LICENSE-2.0
357+ *
358+ * Unless required by applicable law or agreed to in writing, software
359+ * distributed under the License is distributed on an "AS IS" BASIS,
360+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
361+ * See the License for the specific language governing permissions and
362+ * limitations under the License.
363+ */
364+#ifndef MINDSPORE_NNACL_ADAM_FP32_SIMD_H_
365+#define MINDSPORE_NNACL_ADAM_FP32_SIMD_H_
366+
367+#include "nnacl/intrinsics/ms_simd_instructions.h"
368+#ifdef ENABLE_AVX512
369+#include "nnacl/avx512/adam_fp32_avx512.h"
370+#endif
371+
372+#ifdef ENABLE_AVX
373+#include "nnacl/avx/adam_fp32_avx.h"
374+#endif
375+
376+#ifdef ENABLE_SSE
377+#include "nnacl/sse/adam_fp32_sse.h"
378+#endif
379+
380+#ifdef ENABLE_ARM
381+#include "nnacl/neon/adam_fp32_neon.h"
382+#endif
383+
384+#endif
385diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
386new file mode 100644
387index 00000000..83cd76ec
388--- /dev/null
389+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
390@@ -0,0 +1,36 @@
391+/**
392+ * Copyright 2022 Huawei Technologies Co., Ltd
393+ *
394+ * Licensed under the Apache License, Version 2.0 (the "License");
395+ * you may not use this file except in compliance with the License.
396+ * You may obtain a copy of the License at
397+ *
398+ * http://www.apache.org/licenses/LICENSE-2.0
399+ *
400+ * Unless required by applicable law or agreed to in writing, software
401+ * distributed under the License is distributed on an "AS IS" BASIS,
402+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
403+ * See the License for the specific language governing permissions and
404+ * limitations under the License.
405+ */
406+#ifndef MINDSPORE_NNACL_ADD_FP32_SIMD_H_
407+#define MINDSPORE_NNACL_ADD_FP32_SIMD_H_
408+
409+#include "nnacl/intrinsics/ms_simd_instructions.h"
410+#ifdef ENABLE_AVX512
411+#include "nnacl/avx512/add_fp32_avx512.h"
412+#endif
413+
414+#ifdef ENABLE_AVX
415+#include "nnacl/avx/add_fp32_avx.h"
416+#endif
417+
418+#ifdef ENABLE_SSE
419+#include "nnacl/sse/add_fp32_sse.h"
420+#endif
421+
422+#ifdef ENABLE_ARM
423+#include "nnacl/neon/add_fp32_neon.h"
424+#endif
425+
426+#endif
427diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
428new file mode 100644
429index 00000000..898fe882
430--- /dev/null
431+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
432@@ -0,0 +1,36 @@
433+/**
434+ * Copyright 2022 Huawei Technologies Co., Ltd
435+ *
436+ * Licensed under the Apache License, Version 2.0 (the "License");
437+ * you may not use this file except in compliance with the License.
438+ * You may obtain a copy of the License at
439+ *
440+ * http://www.apache.org/licenses/LICENSE-2.0
441+ *
442+ * Unless required by applicable law or agreed to in writing, software
443+ * distributed under the License is distributed on an "AS IS" BASIS,
444+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
445+ * See the License for the specific language governing permissions and
446+ * limitations under the License.
447+ */
448+#ifndef MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_
449+#define MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_
450+
451+#include "nnacl/intrinsics/ms_simd_instructions.h"
452+#ifdef ENABLE_AVX512
453+#include "nnacl/avx512/arithmetic_fp32_avx512.h"
454+#endif
455+
456+#ifdef ENABLE_AVX
457+#include "nnacl/avx/arithmetic_fp32_avx.h"
458+#endif
459+
460+#ifdef ENABLE_SSE
461+#include "nnacl/sse/arithmetic_fp32_sse.h"
462+#endif
463+
464+#ifdef ENABLE_ARM
465+#include "nnacl/neon/arithmetic_fp32_neon.h"
466+#endif
467+
468+#endif
469diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
470new file mode 100644
471index 00000000..676b53ec
472--- /dev/null
473+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
474@@ -0,0 +1,36 @@
475+/**
476+ * Copyright 2022 Huawei Technologies Co., Ltd
477+ *
478+ * Licensed under the Apache License, Version 2.0 (the "License");
479+ * you may not use this file except in compliance with the License.
480+ * You may obtain a copy of the License at
481+ *
482+ * http://www.apache.org/licenses/LICENSE-2.0
483+ *
484+ * Unless required by applicable law or agreed to in writing, software
485+ * distributed under the License is distributed on an "AS IS" BASIS,
486+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
487+ * See the License for the specific language governing permissions and
488+ * limitations under the License.
489+ */
490+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_
491+#define MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_
492+
493+#include "nnacl/intrinsics/ms_simd_instructions.h"
494+#ifdef ENABLE_AVX512
495+#include "nnacl/avx512/arithmetic_self_fp32_avx512.h"
496+#endif
497+
498+#ifdef ENABLE_AVX
499+#include "nnacl/avx/arithmetic_self_fp32_avx.h"
500+#endif
501+
502+#ifdef ENABLE_SSE
503+#include "nnacl/sse/arithmetic_self_fp32_sse.h"
504+#endif
505+
506+#ifdef ENABLE_ARM
507+#include "nnacl/neon/arithmetic_self_fp32_neon.h"
508+#endif
509+
510+#endif
511diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
512new file mode 100644
513index 00000000..49edf7ec
514--- /dev/null
515+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
516@@ -0,0 +1,221 @@
517+/**
518+ * Copyright 2022 Huawei Technologies Co., Ltd
519+ *
520+ * Licensed under the Apache License, Version 2.0 (the "License");
521+ * you may not use this file except in compliance with the License.
522+ * You may obtain a copy of the License at
523+ *
524+ * http://www.apache.org/licenses/LICENSE-2.0
525+ *
526+ * Unless required by applicable law or agreed to in writing, software
527+ * distributed under the License is distributed on an "AS IS" BASIS,
528+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
529+ * See the License for the specific language governing permissions and
530+ * limitations under the License.
531+ */
532+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_
533+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_
534+
535+#include "nnacl/intrinsics/ms_simd_instructions.h"
536+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
537+
538+#ifdef __cplusplus
539+extern "C" {
540+#endif
541+#pragma GCC push_options
542+#pragma GCC target("avx", "avx2")
543+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
544+#define BLOCK_NUM 8
545+#define MS_SIMD_AVX
546+
547+static inline int Fp32ReluAVX(int index, const float *src, int length, float *dst) {
548+    SIMD_F32 zero = SIMD_SET0_F32;
549+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
550+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
551+    }
552+    return index;
553+}
554+
555+static inline int Int32ReluAVX(int index, const int32_t *src, int length, int32_t *dst) {
556+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f);
557+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
558+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
559+    }
560+    return index;
561+}
562+
563+static inline int Fp32Relu6AVX(int index, const float *src, int length, float *dst) {
564+    SIMD_F32 zero = SIMD_SET0_F32;
565+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
566+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
567+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
568+    }
569+    return index;
570+}
571+
572+static inline int LReluAVX(int index, const float *src, int length, float *dst, float alpha) {
573+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
574+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
575+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
576+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
577+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
578+    }
579+    return index;
580+}
581+
582+static inline int SigmoidAVX(int index, const float *src, int length, float *dst) {
583+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
584+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
585+        SIMD_ST_F32(dst + index,
586+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
587+    }
588+    return index;
589+}
590+
591+static inline int TanhAVX(int index, const float *src, int length, float *dst) {
592+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
593+        SIMD_F32 input = SIMD_LD_F32(src + index);
594+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
595+    }
596+    return index;
597+}
598+
599+static inline int SwishAVX(int index, const float *src, int length, float *dst) {
600+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
601+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
602+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
603+        SIMD_ST_F32(dst + index,
604+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
605+    }
606+    return index;
607+}
608+
609+static inline int HSwishAVX(int index, const float *src, int length, float *dst) {
610+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
611+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
612+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
613+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
614+    }
615+    return index;
616+}
617+
618+static inline int HSigmoidAVX(int index, const float *src, int length, float *dst) {
619+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
620+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
621+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
622+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
623+    }
624+    return index;
625+}
626+
627+static inline int HardTanhNoLimitMinAVX(int index, const float *src, int length, float *dst, float min_val,
628+                                            float max_val) {
629+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
630+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
631+    }
632+    return index;
633+}
634+
635+static inline int HardTanhNoLimitMaxAVX(int index, const float *src, int length, float *dst, float min_val,
636+                                            float max_val) {
637+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
638+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
639+    }
640+    return index;
641+}
642+
643+static inline int HardTanhLimitMinMaxAVX(int index, const float *src, int length, float *dst, float min_val,
644+                                             float max_val) {
645+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
646+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
647+    }
648+    return index;
649+}
650+
651+static inline int GeluApproximateAVX(int index, const float *src, int length, float *dst) {
652+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
653+        SIMD_F32 in = SIMD_LD_F32(src + index);
654+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
655+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
656+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
657+    }
658+    return index;
659+}
660+
661+static inline int GeluAVX(int index, const float *src, int length, float *dst) {
662+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
663+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
664+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
665+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
666+      SIMD_F32 in = SIMD_LD_F32(src + index);
667+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
668+      SIMD_ST_F32(dst + index, res);
669+    }
670+    return index;
671+}
672+
673+static inline int EluAVX(int index, const float *src, int length, float *dst, float alpha) {
674+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
675+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
676+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
677+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
678+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
679+    }
680+    return index;
681+}
682+
683+static inline int CeluAVX(int index, const float *src, int length, float *dst, float alpha) {
684+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
685+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
686+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
687+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
688+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
689+    }
690+    return index;
691+}
692+
693+static inline int HShrinkAVX(int index, const float *src, int length, float *dst, float lambd) {
694+    const float neg_lambd = -1 * lambd;
695+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
696+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
697+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
698+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
699+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
700+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
701+    }
702+    return index;
703+}
704+
705+static inline int SoftShrinkAVX(int index, const float *src, int length, float *dst, float lambd) {
706+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
707+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
708+
709+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
710+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
711+        /* v0 = (in > lamdb) & (in - lamdb) */
712+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
713+        /* v1 = (in < -lamdb) & (in + lamdb) */
714+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
715+        /* out = (v0 | v1) */
716+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
717+    }
718+    return index;
719+}
720+
721+static inline int SoftsignFp32OptAVX(int index, const float *src, int length, float *dst) {
722+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
723+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
724+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
725+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
726+    }
727+    return index;
728+}
729+
730+#undef MS_SIMD_INSTRUCTION
731+#undef BLOCK_NUM
732+#pragma GCC pop_options
733+#undef MS_SIMD_AVX
734+#ifdef __cplusplus
735+}
736+#endif
737+#endif
738diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
739new file mode 100644
740index 00000000..435d24c5
741--- /dev/null
742+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
743@@ -0,0 +1,57 @@
744+/**
745+ * Copyright 2022 Huawei Technologies Co., Ltd
746+ *
747+ * Licensed under the Apache License, Version 2.0 (the "License");
748+ * you may not use this file except in compliance with the License.
749+ * You may obtain a copy of the License at
750+ *
751+ * http://www.apache.org/licenses/LICENSE-2.0
752+ *
753+ * Unless required by applicable law or agreed to in writing, software
754+ * distributed under the License is distributed on an "AS IS" BASIS,
755+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
756+ * See the License for the specific language governing permissions and
757+ * limitations under the License.
758+ */
759+
760+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_
761+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_
762+
763+#include "nnacl/intrinsics/ms_simd_instructions.h"
764+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
765+
766+#ifdef __cplusplus
767+extern "C" {
768+#endif
769+#pragma GCC push_options
770+#pragma GCC target("avx", "avx2")
771+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
772+#define BLOCK_NUM 8
773+#define MS_SIMD_AVX
774+
775+static inline int ShrinkGradAVX(int index, const float *src0, const float *src1,
776+                                               int length, float *dst, float lambd) {
777+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
778+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
779+
780+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
781+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
782+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
783+
784+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
785+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
786+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
787+
788+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
789+    }
790+    return index;
791+}
792+
793+#undef MS_SIMD_INSTRUCTION
794+#undef BLOCK_NUM
795+#pragma GCC pop_options
796+#undef MS_SIMD_AVX
797+#ifdef __cplusplus
798+}
799+#endif
800+#endif
801diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
802new file mode 100644
803index 00000000..54743d80
804--- /dev/null
805+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
806@@ -0,0 +1,210 @@
807+/**
808+ * Copyright 2022 Huawei Technologies Co., Ltd
809+ *
810+ * Licensed under the Apache License, Version 2.0 (the "License");
811+ * you may not use this file except in compliance with the License.
812+ * You may obtain a copy of the License at
813+ *
814+ * http://www.apache.org/licenses/LICENSE-2.0
815+ *
816+ * Unless required by applicable law or agreed to in writing, software
817+ * distributed under the License is distributed on an "AS IS" BASIS,
818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
819+ * See the License for the specific language governing permissions and
820+ * limitations under the License.
821+ */
822+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_
823+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_
824+
825+#include "nnacl/intrinsics/ms_simd_instructions.h"
826+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
827+
828+#ifdef __cplusplus
829+extern "C" {
830+#endif
831+#pragma GCC push_options
832+#pragma GCC target("avx", "avx2")
833+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
834+#define BLOCK_NUM 8
835+#define MS_SIMD_AVX
// NOTE(review): this guard was `#ifdef MS_SIMD_AVX512`, but this AVX header defines
// MS_SIMD_AVX (and never MS_SIMD_AVX512), so every Adam kernel below was silently
// compiled out. Changed to MS_SIMD_AVX — confirm against the SIMD header generator.
#ifdef MS_SIMD_AVX
/**
 * One AdamWeightDecay step over an fp32 slice, BLOCK_NUM lanes per iteration:
 *   m   = beta1 * m + (1 - beta1) * g
 *   v   = beta2 * v + (1 - beta2) * g * g
 *   var = var - lr * (m / (sqrt(v) + epsilon) + decay * var)
 * Returns the first index not processed so the caller can finish the scalar tail.
 */
static inline size_t AdamWeightDecayFp32AVX(size_t index, float *var, float *m, float *v, float lr, float beta1,
                                            float beta2, float epsilon, float decay, const float *gradient,
                                            size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);

  // `index + BLOCK_NUM <= end` instead of `index < end - BLOCK_NUM + 1`: the latter
  // underflows (size_t) when end < BLOCK_NUM and would run far out of bounds.
  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {
    SIMD_F32 var_r = SIMD_LD_F32(var + index);
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);

    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
    SIMD_ST_F32(var + index, var_r);
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp32 var/m/v, fp16 (int16 storage) gradient.
 * The gradient is scaled by global_norm_reciprocal before the update.
 * Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp32Fp16AVX(size_t index, float *var, const int16_t *gradient16, float *m, float *v,
                                              float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    SIMD_F32 var_r = SIMD_LD_F32(var + index);
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));

    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(var + index, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp32 var/m/v, fp32 gradient scaled by
 * global_norm_reciprocal. Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp32Fp32AVX(size_t index, float *var, const float *gradient32, float *m, float *v,
                                              float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    SIMD_F32 var_r = SIMD_LD_F32(var + index);
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);

    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(var + index, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp16 (int16 storage) var and gradient, fp32 m/v.
 * var is widened to fp32 for the update and narrowed back on store.
 * Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp16Fp16AVX(size_t index, int16_t *var16, const int16_t *gradient16, float *m,
                                              float *v, float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    // Fixed: was SIMD_LD_HALF_EPI32(var16) — missing `+ index`, so every iteration
    // reloaded element 0 instead of the current block.
    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
  }

  return index;
}

/**
 * Fused-cast AdamWeightDecay: fp16 (int16 storage) var, fp32 gradient, fp32 m/v.
 * var is widened to fp32 for the update and narrowed back on store.
 * Returns the first index not processed.
 */
static inline size_t FusedCastAdamFp16Fp32AVX(size_t index, int16_t *var16, const float *gradient32, float *m,
                                              float *v, float lr, float beta1, float beta2, float epsilon, float decay,
                                              float global_norm_reciprocal, size_t end) {
  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);

  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // avoids size_t underflow for small end
    // Fixed: was SIMD_LD_HALF_EPI32(var16) — missing `+ index` (see Fp16Fp16 variant).
    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
    SIMD_F32 m_r = SIMD_LD_F32(m + index);
    SIMD_F32 v_r = SIMD_LD_F32(v + index);
    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
    m_r = SIMD_MUL_F32(m_r, beta1_r);
    v_r = SIMD_MUL_F32(v_r, beta2_r);
    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
    avx_r0 = SIMD_SQRT_F32(v_r);
    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
    SIMD_ST_F32(m + index, m_r);
    SIMD_ST_F32(v + index, v_r);
    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
  }

  return index;
}
#endif
1008+
1009+#undef MS_SIMD_INSTRUCTION
1010+#undef BLOCK_NUM
1011+#pragma GCC pop_options
1012+#undef MS_SIMD_AVX
1013+#ifdef __cplusplus
1014+}
1015+#endif
1016+#endif
1017diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
1018new file mode 100644
1019index 00000000..716c25b1
1020--- /dev/null
1021+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
1022@@ -0,0 +1,124 @@
1023+/**
1024+ * Copyright 2022 Huawei Technologies Co., Ltd
1025+ *
1026+ * Licensed under the Apache License, Version 2.0 (the "License");
1027+ * you may not use this file except in compliance with the License.
1028+ * You may obtain a copy of the License at
1029+ *
1030+ * http://www.apache.org/licenses/LICENSE-2.0
1031+ *
1032+ * Unless required by applicable law or agreed to in writing, software
1033+ * distributed under the License is distributed on an "AS IS" BASIS,
1034+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1035+ * See the License for the specific language governing permissions and
1036+ * limitations under the License.
1037+ */
1038+
1039+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX_H_
1040+#define MINDSPORE_NNACL_FP32_ADD_AVX_H_
1041+
1042+#include "nnacl/intrinsics/ms_simd_instructions.h"
1043+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1044+
1045+#ifdef __cplusplus
1046+extern "C" {
1047+#endif
1048+#pragma GCC push_options
1049+#pragma GCC target("avx", "avx2")
1050+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1051+#define BLOCK_NUM 8
1052+#define MS_SIMD_AVX
1053+
1054+static inline int ElementOptAddAVX(int index, const float *in0, const float *in1, float *out, int size) {
1055+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
1056+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1057+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1058+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
1059+    SIMD_ST_F32(out + index, vout);
1060+  }
1061+  return index;
1062+}
1063+
1064+static inline int ElementOptAddIntAVX(int index, const int *in0, const int *in1, int *out,
1065+                                                     int size) {
1066+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
1067+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1068+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
1069+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
1070+    SIMD_ST_EPI32(out + index, vout);
1071+  }
1072+  return index;
1073+}
1074+
1075+static inline int ElementOptAddReluAVX(int index, const float *in0, const float *in1, float *out,
1076+                                                      int size) {
1077+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
1078+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1079+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1080+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
1081+    SIMD_ST_F32(out + index, vout);
1082+  }
1083+  return index;
1084+}
1085+
1086+static inline int ElementOptAddRelu6AVX(int index, const float *in0, const float *in1, float *out,
1087+                                                       int size) {
1088+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
1089+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1090+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1091+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
1092+    SIMD_ST_F32(out + index, vout);
1093+  }
1094+  return index;
1095+}
1096+
1097+static inline int ElementAddAVX(int index, const float *in0, const float *in1, float *out, int size) {
1098+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1099+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
1100+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1101+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
1102+    SIMD_ST_F32(out + index, vout);
1103+  }
1104+  return index;
1105+}
1106+
1107+static inline int ElementAddReluAVX(int index, const float *in0, const float *in1, float *out,
1108+                                                   int size) {
1109+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1110+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
1111+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1112+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
1113+    SIMD_ST_F32(out + index, vout);
1114+  }
1115+  return index;
1116+}
1117+
1118+static inline int ElementAddRelu6AVX(int index, const float *in0, const float *in1, float *out,
1119+                                                    int size) {
1120+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1121+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
1122+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
1123+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
1124+    SIMD_ST_F32(out + index, vout);
1125+  }
1126+  return index;
1127+}
1128+
1129+static inline int ElementAddIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1130+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1131+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
1132+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
1133+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
1134+    SIMD_ST_EPI32(out + index, vout);
1135+  }
1136+  return index;
1137+}
1138+
1139+#undef MS_SIMD_INSTRUCTION
1140+#undef BLOCK_NUM
1141+#pragma GCC pop_options
1142+#undef MS_SIMD_AVX
1143+#ifdef __cplusplus
1144+}
1145+#endif
1146+#endif
1147diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
1148new file mode 100644
1149index 00000000..9dd24100
1150--- /dev/null
1151+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
1152@@ -0,0 +1,254 @@
1153+/**
1154+ * Copyright 2022 Huawei Technologies Co., Ltd
1155+ *
1156+ * Licensed under the Apache License, Version 2.0 (the "License");
1157+ * you may not use this file except in compliance with the License.
1158+ * You may obtain a copy of the License at
1159+ *
1160+ * http://www.apache.org/licenses/LICENSE-2.0
1161+ *
1162+ * Unless required by applicable law or agreed to in writing, software
1163+ * distributed under the License is distributed on an "AS IS" BASIS,
1164+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1165+ * See the License for the specific language governing permissions and
1166+ * limitations under the License.
1167+ */
1168+
1169+#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX_H_
1170+#define MINDSPORE_NNACL_ARITHMETIC_AVX_H_
1171+
1172+#include "nnacl/intrinsics/ms_simd_instructions.h"
1173+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1174+
1175+#ifdef __cplusplus
1176+extern "C" {
1177+#endif
1178+#pragma GCC push_options
1179+#pragma GCC target("avx", "avx2")
1180+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1181+#define BLOCK_NUM 8
1182+#define MS_SIMD_AVX
1183+
1184+#ifndef MS_SIMD_NEON
1185+static inline int ElementFloorModAVX(int index, const float *in0, const float *in1, float *out, int size) {
1186+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1187+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1188+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1189+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1190+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
1191+    SIMD_ST_F32(out + index, out_tmp);
1192+  }
1193+  return index;
1194+}
1195+
1196+static inline int ElementOptFloorModNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1197+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1198+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1199+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1200+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1201+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
1202+    SIMD_ST_F32(out + index, out_tmp);
1203+  }
1204+  return index;
1205+}
1206+
1207+static inline int ElementOptFloorModNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1208+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1209+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1210+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1211+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1212+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
1213+    SIMD_ST_F32(out + index, out_tmp);
1214+  }
1215+  return index;
1216+}
1217+
1218+static inline int ElementFloorDivAVX(int index, const float *in0, const float *in1, float *out, int size) {
1219+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1220+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1221+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1222+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1223+    SIMD_ST_F32(out + index, floor_tmp);
1224+  }
1225+  return index;
1226+}
1227+
1228+static inline int ElementOptFloorDivNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1229+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1230+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1231+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1232+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1233+    SIMD_ST_F32(out + index, out_tmp);
1234+  }
1235+  return index;
1236+}
1237+
1238+static inline int ElementOptFloorDivNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1239+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1240+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1241+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1242+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
1243+    SIMD_ST_F32(out + index, out_tmp);
1244+  }
1245+  return index;
1246+}
1247+#endif
1248+
1249+static inline int ElementFloorDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1250+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1251+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1252+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1253+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
1254+    SIMD_ST_EPI32(out + index, out_tmp);
1255+  }
1256+  return index;
1257+}
1258+
1259+static inline int ElementOptFloorDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
1260+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
1261+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1262+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1263+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
1264+    SIMD_ST_EPI32(out + index, out_tmp);
1265+  }
1266+  return index;
1267+}
1268+
1269+static inline int ElementOptFloorDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
1270+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
1271+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1272+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1273+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
1274+    SIMD_ST_EPI32(out + index, out_tmp);
1275+  }
1276+  return index;
1277+}
1278+
1279+static inline int ElementMaximumAVX(int index, const float *in0, const float *in1, float *out, int size) {
1280+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1281+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1282+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1283+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
1284+    SIMD_ST_F32(out + index, out_tmp);
1285+  }
1286+  return index;
1287+}
1288+
1289+static inline int ElementOptMaximumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1290+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1291+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1292+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1293+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
1294+    SIMD_ST_F32(out + index, out_tmp);
1295+  }
1296+  return index;
1297+}
1298+
1299+static inline int ElementOptMaximumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1300+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1301+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1302+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1303+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
1304+    SIMD_ST_F32(out + index, out_tmp);
1305+  }
1306+  return index;
1307+}
1308+
1309+static inline int ElementMaximumIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1310+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1311+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1312+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1313+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
1314+    SIMD_ST_EPI32(out + index, out_tmp);
1315+  }
1316+  return index;
1317+}
1318+
1319+static inline int ElementOptMaximumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
1320+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
1321+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1322+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1323+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
1324+    SIMD_ST_EPI32(out + index, out_tmp);
1325+  }
1326+  return index;
1327+}
1328+
1329+static inline int ElementOptMaximumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
1330+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
1331+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1332+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1333+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
1334+    SIMD_ST_EPI32(out + index, out_tmp);
1335+  }
1336+  return index;
1337+}
1338+
1339+static inline int ElementMinimumIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
1340+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1341+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1342+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1343+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
1344+    SIMD_ST_EPI32(out + index, out_tmp);
1345+  }
1346+  return index;
1347+}
1348+
1349+static inline int ElementOptMinimumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
1350+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
1351+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1352+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
1353+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
1354+    SIMD_ST_EPI32(out + index, out_tmp);
1355+  }
1356+  return index;
1357+}
1358+
1359+static inline int ElementOptMinimumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
1360+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
1361+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1362+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
1363+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
1364+    SIMD_ST_EPI32(out + index, out_tmp);
1365+  }
1366+  return index;
1367+}
1368+
1369+static inline int ElementMinimumAVX(int index, const float *in0, const float *in1, float *out, int size) {
1370+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1371+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1372+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1373+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
1374+    SIMD_ST_F32(out + index, out_tmp);
1375+  }
1376+  return index;
1377+}
1378+
1379+static inline int ElementOptMinimumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
1380+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
1381+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1382+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
1383+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
1384+    SIMD_ST_F32(out + index, out_tmp);
1385+  }
1386+  return index;
1387+}
1388+
1389+static inline int ElementOptMinimumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
1390+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
1391+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1392+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
1393+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
1394+    SIMD_ST_F32(out + index, out_tmp);
1395+  }
1396+  return index;
1397+}
1398+
1399+#undef MS_SIMD_INSTRUCTION
1400+#undef BLOCK_NUM
1401+#pragma GCC pop_options
1402+#undef MS_SIMD_AVX
1403+#ifdef __cplusplus
1404+}
1405+#endif
1406+#endif
1407diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
1408new file mode 100644
1409index 00000000..c48500f4
1410--- /dev/null
1411+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
1412@@ -0,0 +1,129 @@
1413+/**
1414+ * Copyright 2022 Huawei Technologies Co., Ltd
1415+ *
1416+ * Licensed under the Apache License, Version 2.0 (the "License");
1417+ * you may not use this file except in compliance with the License.
1418+ * You may obtain a copy of the License at
1419+ *
1420+ * http://www.apache.org/licenses/LICENSE-2.0
1421+ *
1422+ * Unless required by applicable law or agreed to in writing, software
1423+ * distributed under the License is distributed on an "AS IS" BASIS,
1424+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1425+ * See the License for the specific language governing permissions and
1426+ * limitations under the License.
1427+ */
1428+
1429+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_
1430+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_
1431+
1432+#include "nnacl/intrinsics/ms_simd_instructions.h"
1433+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1434+
1435+#ifdef __cplusplus
1436+extern "C" {
1437+#endif
1438+#pragma GCC push_options
1439+#pragma GCC target("avx", "avx2")
1440+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1441+#define BLOCK_NUM 8
1442+#define MS_SIMD_AVX
1443+
1444+#if defined(MS_SIMD_AVX512)
1445+// only avx512 support abs fp32 instruction
1446+static inline int ElementAbsAVX(int index, const float *input, float *output, const int element_size) {
1447+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1448+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
1449+  }
1450+  return index;
1451+}
1452+
1453+static inline int ElementAbsIntAVX(int index, const int *input, int *output, const int element_size) {
1454+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1455+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
1456+  }
1457+  return index;
1458+}
1459+#endif
1460+
1461+static inline int ElementSquareAVX(int index, const float *input, float *output, const int element_size) {
1462+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1463+    SIMD_F32 vin = SIMD_LD_F32(input + index);
1464+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
1465+  }
1466+  return index;
1467+}
1468+
1469+static inline int ElementSqrtAVX(int index, const float *input, float *output, const int element_size) {
1470+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1471+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
1472+  }
1473+  return index;
1474+}
1475+
1476+static inline int ElementRsqrtAVX(int index, const float *input, float *output, const int element_size) {
1477+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1478+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
1479+  }
1480+  return index;
1481+}
1482+
1483+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
1484+// avx512 dont support round fp32 instruction
1485+static inline int ElementRoundAVX(int index, const float *input, float *output, const int element_size) {
1486+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1487+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
1488+  }
1489+  return index;
1490+}
1491+#endif
1492+
1493+#ifndef MS_SIMD_NEON
1494+// neon dont support floor fp32 instruction
1495+static inline int ElementFloorAVX(int index, const float *input, float *output, const int element_size) {
1496+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1497+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
1498+  }
1499+  return index;
1500+}
1501+#endif
1502+
1503+#ifndef MS_SIMD_NEON
1504+static inline int ElementCeilAVX(int index, const float *input, float *output, const int element_size) {
1505+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1506+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
1507+  }
1508+  return index;
1509+}
1510+#endif
1511+
1512+static inline int ElementNegativeAVX(int index, const float *input, float *output, const int element_size) {
1513+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1514+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
1515+  }
1516+  return index;
1517+}
1518+
1519+static inline int ElementNegativeIntAVX(int index, const int *input, int *output, const int element_size) {
1520+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1521+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
1522+  }
1523+  return index;
1524+}
1525+
1526+static inline int ElementReciprocalAVX(int index, const float *input, float *output, const int element_size) {
1527+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
1528+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1529+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
1530+  }
1531+  return index;
1532+}
1533+
1534+#undef MS_SIMD_INSTRUCTION
1535+#undef BLOCK_NUM
1536+#pragma GCC pop_options
1537+#undef MS_SIMD_AVX
1538+#ifdef __cplusplus
1539+}
1540+#endif
1541+#endif
1542diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
1543new file mode 100644
1544index 00000000..11a9087b
1545--- /dev/null
1546+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
1547@@ -0,0 +1,67 @@
1548+/**
1549+ * Copyright 2022 Huawei Technologies Co., Ltd
1550+ *
1551+ * Licensed under the Apache License, Version 2.0 (the "License");
1552+ * you may not use this file except in compliance with the License.
1553+ * You may obtain a copy of the License at
1554+ *
1555+ * http://www.apache.org/licenses/LICENSE-2.0
1556+ *
1557+ * Unless required by applicable law or agreed to in writing, software
1558+ * distributed under the License is distributed on an "AS IS" BASIS,
1559+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1560+ * See the License for the specific language governing permissions and
1561+ * limitations under the License.
1562+ */
1563+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_
1564+#define MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_
1565+
1566+#include "nnacl/intrinsics/ms_simd_instructions.h"
1567+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1568+
1569+#ifdef __cplusplus
1570+extern "C" {
1571+#endif
1572+#pragma GCC push_options
1573+#pragma GCC target("avx", "avx2")
1574+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1575+#define BLOCK_NUM 8
1576+#define MS_SIMD_AVX
1577+
1578+static inline int BatchNormFp32AVX(int index, const float *input, const float *mean,
1579+  const float *variance, int channel, float epsilon, float *output) {
1580+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1581+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
1582+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
1583+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
1584+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
1585+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
1586+    SIMD_ST_F32(output + index, output_data);
1587+  }
1588+  return index;
1589+}
1590+
1591+static inline int FusedBatchNormFp32AVX(int index, const float *input, const float *scale,
1592+  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
1593+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1594+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
1595+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
1596+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
1597+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
1598+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
1599+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
1600+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
1601+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
1602+    SIMD_ST_F32(output + index, output_data);
1603+  }
1604+  return index;
1605+}
1606+
1607+#undef MS_SIMD_INSTRUCTION
1608+#undef BLOCK_NUM
1609+#pragma GCC pop_options
1610+#undef MS_SIMD_AVX
1611+#ifdef __cplusplus
1612+}
1613+#endif
1614+#endif
1615diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
1616new file mode 100644
1617index 00000000..9da68a79
1618--- /dev/null
1619+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
1620@@ -0,0 +1,69 @@
1621+/**
1622+ * Copyright 2022 Huawei Technologies Co., Ltd
1623+ *
1624+ * Licensed under the Apache License, Version 2.0 (the "License");
1625+ * you may not use this file except in compliance with the License.
1626+ * You may obtain a copy of the License at
1627+ *
1628+ * http://www.apache.org/licenses/LICENSE-2.0
1629+ *
1630+ * Unless required by applicable law or agreed to in writing, software
1631+ * distributed under the License is distributed on an "AS IS" BASIS,
1632+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1633+ * See the License for the specific language governing permissions and
1634+ * limitations under the License.
1635+ */
1636+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_
1637+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_
1638+
1639+#include "nnacl/intrinsics/ms_simd_instructions.h"
1640+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1641+
1642+#ifdef __cplusplus
1643+extern "C" {
1644+#endif
1645+#pragma GCC push_options
1646+#pragma GCC target("avx", "avx2")
1647+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1648+#define BLOCK_NUM 8
1649+#define MS_SIMD_AVX
1650+
1651+static inline int BCEWithLogitLossAVX(int index, const float *logits, const float *label,
1652+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
1653+    float *reduction_sum) {
1654+    SIMD_F32 zero = SIMD_SET0_F32;
1655+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
1656+    SIMD_F32 middle_output = SIMD_SET0_F32;
1657+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1658+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
1659+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
1660+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
1661+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
1662+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
1663+      SIMD_F32 max_value = neg_logits_tmp;
1664+      max_value = SIMD_MIN_F32(max_value, zero);
1665+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
1666+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
1667+      SIMD_F32 log_exp_value =
1668+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
1669+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
1670+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
1671+      if (reduction) {
1672+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
1673+      } else {
1674+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
1675+      }
1676+    }
1677+    if (reduction) {
1678+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
1679+    }
1680+    return index;
1681+}
1682+#undef MS_SIMD_INSTRUCTION
1683+#undef BLOCK_NUM
1684+#pragma GCC pop_options
1685+#undef MS_SIMD_AVX
1686+#ifdef __cplusplus
1687+}
1688+#endif
1689+#endif
1690diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
1691new file mode 100644
1692index 00000000..e54588bb
1693--- /dev/null
1694+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
1695@@ -0,0 +1,64 @@
1696+/**
1697+ * Copyright 2022 Huawei Technologies Co., Ltd
1698+ *
1699+ * Licensed under the Apache License, Version 2.0 (the "License");
1700+ * you may not use this file except in compliance with the License.
1701+ * You may obtain a copy of the License at
1702+ *
1703+ * http://www.apache.org/licenses/LICENSE-2.0
1704+ *
1705+ * Unless required by applicable law or agreed to in writing, software
1706+ * distributed under the License is distributed on an "AS IS" BASIS,
1707+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1708+ * See the License for the specific language governing permissions and
1709+ * limitations under the License.
1710+ */
1711+
1712+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
1713+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
1714+
1715+#include "nnacl/intrinsics/ms_simd_instructions.h"
1716+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1717+
1718+#ifdef __cplusplus
1719+extern "C" {
1720+#endif
1721+#pragma GCC push_options
1722+#pragma GCC target("avx", "avx2")
1723+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1724+#define BLOCK_NUM 8
1725+#define MS_SIMD_AVX
1726+
1727+static inline int BiasAddByInnerCoreAVX(int index, const float *input, const float *bias, float *output,
1728+                                                       int64_t num) {
1729+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1730+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
1731+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
1732+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
1733+    SIMD_ST_F32(output + index, vout);
1734+  }
1735+  return index;
1736+}
1737+
1738+static inline int BiasAddByBatchCoreAVX(int index, const float *input, const float *bias, float *output1,
1739+                                                       float *output2, float *output3, float *output4, int64_t num) {
1740+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1741+    SIMD_LDX4_F32(input_data, input + index, num);
1742+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
1743+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
1744+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
1745+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
1746+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
1747+  }
1748+  return index;
1749+}
1750+
1751+#undef MS_SIMD_INSTRUCTION
1752+#undef BLOCK_NUM
1753+#pragma GCC pop_options
1754+#undef MS_SIMD_AVX
1755+#ifdef __cplusplus
1756+}
1757+#endif
1758+
1759+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
1760diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
1761new file mode 100644
1762index 00000000..44176549
1763--- /dev/null
1764+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
1765@@ -0,0 +1,56 @@
1766+/**
1767+ * Copyright 2022 Huawei Technologies Co., Ltd
1768+ *
1769+ * Licensed under the Apache License, Version 2.0 (the "License");
1770+ * you may not use this file except in compliance with the License.
1771+ * You may obtain a copy of the License at
1772+ *
1773+ * http://www.apache.org/licenses/LICENSE-2.0
1774+ *
1775+ * Unless required by applicable law or agreed to in writing, software
1776+ * distributed under the License is distributed on an "AS IS" BASIS,
1777+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1778+ * See the License for the specific language governing permissions and
1779+ * limitations under the License.
1780+ */
1781+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_
1782+#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_
1783+
1784+#include "nnacl/intrinsics/ms_simd_instructions.h"
1785+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1786+
1787+#ifdef __cplusplus
1788+extern "C" {
1789+#endif
1790+#pragma GCC push_options
1791+#pragma GCC target("avx", "avx2")
1792+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1793+#define BLOCK_NUM 8
1794+#define MS_SIMD_AVX
1795+
1796+static inline int Int32ToFloat32AVX(int index, const int32_t *input, float *output, int number) {
1797+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1798+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
1799+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
1800+  }
1801+  return index;
1802+}
1803+
1804+#ifndef MS_SIMD_NEON
1805+static inline int Float32ToInt32AVX(int index, const float *input, int32_t *output, int number) {
1806+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1807+    SIMD_F32 value = SIMD_LD_F32(input + index);
1808+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
1809+  }
1810+  return index;
1811+}
1812+#endif
1813+
1814+#undef MS_SIMD_INSTRUCTION
1815+#undef BLOCK_NUM
1816+#pragma GCC pop_options
1817+#undef MS_SIMD_AVX
1818+#ifdef __cplusplus
1819+}
1820+#endif
1821+#endif
1822diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
1823new file mode 100644
1824index 00000000..dac9efa9
1825--- /dev/null
1826+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
1827@@ -0,0 +1,70 @@
1828+/**
1829+ * Copyright 2022 Huawei Technologies Co., Ltd
1830+ *
1831+ * Licensed under the Apache License, Version 2.0 (the "License");
1832+ * you may not use this file except in compliance with the License.
1833+ * You may obtain a copy of the License at
1834+ *
1835+ * http://www.apache.org/licenses/LICENSE-2.0
1836+ *
1837+ * Unless required by applicable law or agreed to in writing, software
1838+ * distributed under the License is distributed on an "AS IS" BASIS,
1839+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1840+ * See the License for the specific language governing permissions and
1841+ * limitations under the License.
1842+ */
1843+#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX_H_
1844+#define MINDSPORE_NNACL_FP32_CDIST_AVX_H_
1845+
1846+#include "nnacl/intrinsics/ms_simd_instructions.h"
1847+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1848+
1849+#ifdef __cplusplus
1850+extern "C" {
1851+#endif
1852+#pragma GCC push_options
1853+#pragma GCC target("avx", "avx2")
1854+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1855+#define BLOCK_NUM 8
1856+#define MS_SIMD_AVX
1857+
1858+static inline int64_t CdistTwoNormalOptAVX(int64_t index, const float *a, const float *b,
1859+                                                          float *out, int64_t size) {
1860+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
1861+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1862+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
1863+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
1864+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
1865+    tmp_vec = SIMD_ABS_F32(tmp_vec);
1866+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
1867+  }
1868+  *out += SIMD_GET_SUM_F32(result_vec);
1869+
1870+  return index;
1871+}
1872+
1873+static inline int64_t CdistPNormalOptAVX(int64_t index, const float *a, const float *b,
1874+                                                        float *out, int64_t size, float p) {
1875+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
1876+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
1877+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1878+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
1879+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
1880+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
1881+    tmp_vec = SIMD_ABS_F32(tmp_vec);
1882+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
1883+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
1884+  }
1885+  *out += SIMD_GET_SUM_F32(result_vec);
1886+
1887+  return index;
1888+}
1889+
1890+#undef MS_SIMD_INSTRUCTION
1891+#undef BLOCK_NUM
1892+#pragma GCC pop_options
1893+#undef MS_SIMD_AVX
1894+#ifdef __cplusplus
1895+}
1896+#endif
1897+#endif
1898diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
1899new file mode 100644
1900index 00000000..7407942f
1901--- /dev/null
1902+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
1903@@ -0,0 +1,121 @@
1904+/**
1905+ * Copyright 2022 Huawei Technologies Co., Ltd
1906+ *
1907+ * Licensed under the Apache License, Version 2.0 (the "License");
1908+ * you may not use this file except in compliance with the License.
1909+ * You may obtain a copy of the License at
1910+ *
1911+ * http://www.apache.org/licenses/LICENSE-2.0
1912+ *
1913+ * Unless required by applicable law or agreed to in writing, software
1914+ * distributed under the License is distributed on an "AS IS" BASIS,
1915+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1916+ * See the License for the specific language governing permissions and
1917+ * limitations under the License.
1918+ */
1919+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_
1920+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_
1921+
1922+#include "nnacl/intrinsics/ms_simd_instructions.h"
1923+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
1924+
1925+#ifdef __cplusplus
1926+extern "C" {
1927+#endif
1928+#pragma GCC push_options
1929+#pragma GCC target("avx", "avx2")
1930+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
1931+#define BLOCK_NUM 8
1932+#define MS_SIMD_AVX
1933+
1934+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
1935+// (a, b, c) -> (0, a,   a+b)    exclusive == true
1936+static inline int64_t CumsumOutputInitWithInputAVX(int64_t index, const float *layer_input,
1937+  float *layer_output, int inner_dim) {
1938+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1939+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
1940+  }
1941+  return index;
1942+}
1943+
1944+static inline int64_t CumsumOutputInitWithZeroAVX(int64_t index, float *layer_output, int inner_dim) {
1945+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1946+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
1947+  }
1948+  return index;
1949+}
1950+
1951+static inline int64_t CumsumAVX(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
1952+  int inner_dim) {
1953+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1954+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
1955+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
1956+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
1957+    SIMD_ST_F32(layer_output + index, out_val);
1958+  }
1959+  return index;
1960+}
1961+
1962+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
1963+// (a, b, c) -> (c+b, c, 0) exclusive==true
1964+static inline int64_t CumsumReverseAVX(int64_t index, const float *layer_input, float *layer_output,
1965+  float *layer_last_output, int inner_dim) {
1966+
1967+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1968+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
1969+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
1970+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
1971+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
1972+  }
1973+  return index;
1974+}
1975+
1976+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
1977+// (a, b, c) -> (0, a,   a+b)    exclusive == true
1978+static inline int64_t CumsumIntOutputInitWithInputAVX(int64_t index, const int *layer_input,
1979+  int *layer_output, int inner_dim) {
1980+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1981+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
1982+  }
1983+  return index;
1984+}
1985+
1986+static inline int64_t CumsumIntOutputInitWithZeroAVX(int64_t index, int *layer_output, int inner_dim) {
1987+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1988+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
1989+  }
1990+  return index;
1991+}
1992+
1993+static inline int64_t CumsumIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
1994+  int inner_dim) {
1995+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
1996+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
1997+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
1998+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
1999+    SIMD_ST_EPI32(layer_output + index, out_val);
2000+  }
2001+  return index;
2002+}
2003+
2004+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
2005+// (a, b, c) -> (c+b, c, 0) exclusive==true
2006+static inline int64_t CumsumReverseIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
2007+  int inner_dim) {
2008+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2009+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
2010+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
2011+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
2012+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
2013+  }
2014+  return index;
2015+}
2016+
2017+#undef MS_SIMD_INSTRUCTION
2018+#undef BLOCK_NUM
2019+#pragma GCC pop_options
2020+#undef MS_SIMD_AVX
2021+#ifdef __cplusplus
2022+}
2023+#endif
2024+#endif
2025diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
2026new file mode 100644
2027index 00000000..3710151e
2028--- /dev/null
2029+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
2030@@ -0,0 +1,167 @@
2031+/**
2032+ * Copyright 2022 Huawei Technologies Co., Ltd
2033+ *
2034+ * Licensed under the Apache License, Version 2.0 (the "License");
2035+ * you may not use this file except in compliance with the License.
2036+ * You may obtain a copy of the License at
2037+ *
2038+ * http://www.apache.org/licenses/LICENSE-2.0
2039+ *
2040+ * Unless required by applicable law or agreed to in writing, software
2041+ * distributed under the License is distributed on an "AS IS" BASIS,
2042+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2043+ * See the License for the specific language governing permissions and
2044+ * limitations under the License.
2045+ */
2046+
2047+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX_H_
2048+#define MINDSPORE_NNACL_FP32_DIV_AVX_H_
2049+
2050+#include "nnacl/intrinsics/ms_simd_instructions.h"
2051+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2052+
2053+#ifdef __cplusplus
2054+extern "C" {
2055+#endif
2056+#pragma GCC push_options
2057+#pragma GCC target("avx", "avx2")
2058+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2059+#define BLOCK_NUM 8
2060+#define MS_SIMD_AVX
2061+
2062+static inline int ElementOptDivNum0AVX(int index, const float *in0, const float *in1, float *out,
2063+                                                      int size) {
2064+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
2065+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2066+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2067+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
2068+    SIMD_ST_F32(out + index, vout);
2069+  }
2070+  return index;
2071+}
2072+
2073+static inline int ElementOptDivNum1AVX(int index, const float *in0, const float *in1, float *out,
2074+                                                      int size) {
2075+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2076+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2077+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2078+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
2079+    SIMD_ST_F32(out + index, vout);
2080+  }
2081+  return index;
2082+}
2083+
2084+static inline int ElementOptDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2085+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
2086+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2087+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2088+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
2089+    SIMD_ST_EPI32(out + index, vout);
2090+  }
2091+  return index;
2092+}
2093+
2094+static inline int ElementOptDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2095+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2096+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2097+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2098+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
2099+    SIMD_ST_EPI32(out + index, vout);
2100+  }
2101+  return index;
2102+}
2103+
2104+static inline int ElementOptDivReluNum0AVX(int index, const float *in0, const float *in1, float *out,
2105+                                                          int size) {
2106+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
2107+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2108+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2109+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
2110+    SIMD_ST_F32(out + index, vout);
2111+  }
2112+  return index;
2113+}
2114+
2115+static inline int ElementOptDivReluNum1AVX(int index, const float *in0, const float *in1, float *out,
2116+                                                          int size) {
2117+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2118+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2119+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2120+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
2121+    SIMD_ST_F32(out + index, vout);
2122+  }
2123+  return index;
2124+}
2125+
2126+static inline int ElementOptDivRelu6Num0AVX(int index, const float *in0, const float *in1, float *out,
2127+                                                           int size) {
2128+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
2129+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2130+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2131+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
2132+    SIMD_ST_F32(out + index, vout);
2133+  }
2134+  return index;
2135+}
2136+
2137+static inline int ElementOptDivRelu6Num1AVX(int index, const float *in0, const float *in1, float *out,
2138+                                                           int size) {
2139+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2140+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2141+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2142+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
2143+    SIMD_ST_F32(out + index, vout);
2144+  }
2145+  return index;
2146+}
2147+
2148+static inline int ElementDivAVX(int index, const float *in0, const float *in1, float *out, int size) {
2149+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2150+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2151+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2152+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
2153+    SIMD_ST_F32(out + index, vout);
2154+  }
2155+  return index;
2156+}
2157+
2158+static inline int ElementDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2159+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2160+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2161+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2162+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
2163+    SIMD_ST_EPI32(out + index, vout);
2164+  }
2165+  return index;
2166+}
2167+
2168+static inline int ElementDivReluAVX(int index, const float *in0, const float *in1, float *out,
2169+                                                   int size) {
2170+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2171+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2172+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2173+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
2174+    SIMD_ST_F32(out + index, vout);
2175+  }
2176+  return index;
2177+}
2178+
2179+static inline int ElementDivRelu6AVX(int index, const float *in0, const float *in1, float *out,
2180+                                                    int size) {
2181+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2182+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2183+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2184+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
2185+    SIMD_ST_F32(out + index, vout);
2186+  }
2187+  return index;
2188+}
2189+
2190+#undef MS_SIMD_INSTRUCTION
2191+#undef BLOCK_NUM
2192+#pragma GCC pop_options
2193+#undef MS_SIMD_AVX
2194+#ifdef __cplusplus
2195+}
2196+#endif
2197+#endif
2198diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
2199new file mode 100644
2200index 00000000..cbd4eca5
2201--- /dev/null
2202+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
2203@@ -0,0 +1,46 @@
2204+/**
2205+ * Copyright 2022 Huawei Technologies Co., Ltd
2206+ *
2207+ * Licensed under the Apache License, Version 2.0 (the "License");
2208+ * you may not use this file except in compliance with the License.
2209+ * You may obtain a copy of the License at
2210+ *
2211+ * http://www.apache.org/licenses/LICENSE-2.0
2212+ *
2213+ * Unless required by applicable law or agreed to in writing, software
2214+ * distributed under the License is distributed on an "AS IS" BASIS,
2215+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2216+ * See the License for the specific language governing permissions and
2217+ * limitations under the License.
2218+ */
2219+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_
2220+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_
2221+
2222+#include "nnacl/intrinsics/ms_simd_instructions.h"
2223+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2224+
2225+#ifdef __cplusplus
2226+extern "C" {
2227+#endif
2228+#pragma GCC push_options
2229+#pragma GCC target("avx", "avx2")
2230+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2231+#define BLOCK_NUM 8
2232+#define MS_SIMD_AVX
2233+
2234+static inline int DropoutFp32AVX(int index, const float *input, float scale,
2235+  int length, float *output) {
2236+  SIMD_F32 scale_value = SIMD_MOV_F32(scale);
2237+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2238+    SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
2239+  }
2240+  return index;
2241+}
2242+#undef MS_SIMD_INSTRUCTION
2243+#undef BLOCK_NUM
2244+#pragma GCC pop_options
2245+#undef MS_SIMD_AVX
2246+#ifdef __cplusplus
2247+}
2248+#endif
2249+#endif
2250diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
2251new file mode 100644
2252index 00000000..cf7cbd37
2253--- /dev/null
2254+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
2255@@ -0,0 +1,63 @@
2256+/**
2257+ * Copyright 2022 Huawei Technologies Co., Ltd
2258+ *
2259+ * Licensed under the Apache License, Version 2.0 (the "License");
2260+ * you may not use this file except in compliance with the License.
2261+ * You may obtain a copy of the License at
2262+ *
2263+ * http://www.apache.org/licenses/LICENSE-2.0
2264+ *
2265+ * Unless required by applicable law or agreed to in writing, software
2266+ * distributed under the License is distributed on an "AS IS" BASIS,
2267+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2268+ * See the License for the specific language governing permissions and
2269+ * limitations under the License.
2270+ */
2271+
2272+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX_H_
2273+#define MINDSPORE_NNACL_FP32_EXP_AVX_H_
2274+
2275+#include "nnacl/intrinsics/ms_simd_instructions.h"
2276+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2277+
2278+#ifdef __cplusplus
2279+extern "C" {
2280+#endif
2281+#pragma GCC push_options
2282+#pragma GCC target("avx", "avx2")
2283+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2284+#define BLOCK_NUM 8
2285+#define MS_SIMD_AVX
2286+
2287+static inline int64_t ExpFp32AVX(int64_t index, const float *src, float *dst, int num) {
2288+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2289+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
2290+  }
2291+  return index;
2292+}
2293+
2294+static inline int64_t ExpFp32WithInScaleAVX(int64_t index, const float *src, float *dst, int num, float in_scale) {
2295+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
2296+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2297+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
2298+  }
2299+  return index;
2300+}
2301+
2302+static inline int64_t ExpFp32WithOutScaleAVX(int64_t index, const float *src, float *dst, int num, float out_scale) {
2303+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
2304+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2305+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
2306+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
2307+  }
2308+  return index;
2309+}
2310+
2311+#undef MS_SIMD_INSTRUCTION
2312+#undef BLOCK_NUM
2313+#pragma GCC pop_options
2314+#undef MS_SIMD_AVX
2315+#ifdef __cplusplus
2316+}
2317+#endif
2318+#endif
2319diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
2320new file mode 100644
2321index 00000000..8b01844e
2322--- /dev/null
2323+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
2324@@ -0,0 +1,53 @@
2325+/**
2326+ * Copyright 2022 Huawei Technologies Co., Ltd
2327+ *
2328+ * Licensed under the Apache License, Version 2.0 (the "License");
2329+ * you may not use this file except in compliance with the License.
2330+ * You may obtain a copy of the License at
2331+ *
2332+ * http://www.apache.org/licenses/LICENSE-2.0
2333+ *
2334+ * Unless required by applicable law or agreed to in writing, software
2335+ * distributed under the License is distributed on an "AS IS" BASIS,
2336+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2337+ * See the License for the specific language governing permissions and
2338+ * limitations under the License.
2339+ */
2340+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_
2341+#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_
2342+
2343+#include "nnacl/intrinsics/ms_simd_instructions.h"
2344+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2345+
2346+#ifdef __cplusplus
2347+extern "C" {
2348+#endif
2349+#pragma GCC push_options
2350+#pragma GCC target("avx", "avx2")
2351+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2352+#define BLOCK_NUM 8
2353+#define MS_SIMD_AVX
2354+
2355+static inline int FillFp32AVX(int index, float *output, int size, float data) {
2356+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2357+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
2358+  }
2359+  return index;
2360+}
2361+
2362+static inline int FillInt32AVX(int index, int *output, int size, int data) {
2363+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2364+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
2365+  }
2366+  return index;
2367+}
2368+
2369+#undef MS_SIMD_INSTRUCTION
2370+#undef BLOCK_NUM
2371+#pragma GCC pop_options
2372+#undef MS_SIMD_AVX
2373+#ifdef __cplusplus
2374+}
2375+#endif
2376+#endif
2377+
2378diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
2379new file mode 100644
2380index 00000000..d5076e59
2381--- /dev/null
2382+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
2383@@ -0,0 +1,77 @@
2384+/**
2385+ * Copyright 2022 Huawei Technologies Co., Ltd
2386+ *
2387+ * Licensed under the Apache License, Version 2.0 (the "License");
2388+ * you may not use this file except in compliance with the License.
2389+ * You may obtain a copy of the License at
2390+ *
2391+ * http://www.apache.org/licenses/LICENSE-2.0
2392+ *
2393+ * Unless required by applicable law or agreed to in writing, software
2394+ * distributed under the License is distributed on an "AS IS" BASIS,
2395+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2396+ * See the License for the specific language governing permissions and
2397+ * limitations under the License.
2398+ */
2399+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_
2400+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_
2401+
2402+#include "nnacl/intrinsics/ms_simd_instructions.h"
2403+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2404+
2405+#ifdef __cplusplus
2406+extern "C" {
2407+#endif
2408+#pragma GCC push_options
2409+#pragma GCC target("avx", "avx2")
2410+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2411+#define BLOCK_NUM 8
2412+#define MS_SIMD_AVX
2413+
2414+static inline int64_t GroupNormFp32AVX(int64_t index, const float *unit_input, float scale, float offset, float mean,
2415+  float var_sqrt, int unit, float *unit_output) {
2416+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
2417+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
2418+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
2419+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
2420+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2421+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
2422+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
2423+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
2424+    SIMD_ST_F32(unit_output + index, output);
2425+  }
2426+  return index;
2427+}
2428+
2429+static inline int64_t GroupNormReduceSumAVX(int64_t index, const float *in, float *sum, int unit) {
2430+  if (unit - index >= 4 * BLOCK_NUM) {
2431+    SIMD_F32 tmp = SIMD_MOV_F32(0);
2432+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2433+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
2434+    }
2435+    *sum += SIMD_GET_SUM_F32(tmp);
2436+  }
2437+  return index;
2438+}
2439+
2440+static inline int64_t GroupNormReduceVarAVX(int64_t index, const float *in, float mean, float *sum, int unit) {
2441+  if (unit - index >= 4 * BLOCK_NUM) {
2442+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
2443+    SIMD_F32 tmp = SIMD_MOV_F32(0);
2444+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2445+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
2446+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
2447+    }
2448+    *sum += SIMD_GET_SUM_F32(tmp);
2449+  }
2450+  return index;
2451+}
2452+
2453+#undef MS_SIMD_INSTRUCTION
2454+#undef BLOCK_NUM
2455+#pragma GCC pop_options
2456+#undef MS_SIMD_AVX
2457+#ifdef __cplusplus
2458+}
2459+#endif
2460+#endif
2461diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
2462new file mode 100644
2463index 00000000..96fdf185
2464--- /dev/null
2465+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
2466@@ -0,0 +1,68 @@
2467+/**
2468+ * Copyright 2022 Huawei Technologies Co., Ltd
2469+ *
2470+ * Licensed under the Apache License, Version 2.0 (the "License");
2471+ * you may not use this file except in compliance with the License.
2472+ * You may obtain a copy of the License at
2473+ *
2474+ * http://www.apache.org/licenses/LICENSE-2.0
2475+ *
2476+ * Unless required by applicable law or agreed to in writing, software
2477+ * distributed under the License is distributed on an "AS IS" BASIS,
2478+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2479+ * See the License for the specific language governing permissions and
2480+ * limitations under the License.
2481+ */
2482+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_
2483+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_
2484+
2485+#include "nnacl/intrinsics/ms_simd_instructions.h"
2486+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2487+
2488+#ifdef __cplusplus
2489+extern "C" {
2490+#endif
2491+#pragma GCC push_options
2492+#pragma GCC target("avx", "avx2")
2493+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2494+#define BLOCK_NUM 8
2495+#define MS_SIMD_AVX
2496+
2497+static inline int LayerNormMeanAndSquareAVX(int index, const float *src, int num, float *mean, float *square_mean) {
2498+  if (num >= 4 * BLOCK_NUM) {
2499+    SIMD_F32 sum_val = SIMD_SET0_F32;
2500+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
2501+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2502+      SIMD_F32 value = SIMD_LD_F32(src + index);
2503+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
2504+      sum_val = SIMD_ADD_F32(sum_val, value);
2505+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
2506+    }
2507+    *mean += SIMD_GET_SUM_F32(sum_val);
2508+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
2509+  }
2510+  return index;
2511+}
2512+
2513+static inline int LayerNormGammaAndBetaAVX(int index, float *dst, const float *src, const float *gamma_data,
2514+  const float *beta_data, int num, const float mean, const float deno) {
2515+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
2516+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
2517+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2518+    SIMD_F32 value = SIMD_LD_F32(src + index);
2519+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
2520+    out_value = SIMD_MUL_F32(out_value, deno_val);
2521+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
2522+    SIMD_ST_F32(dst + index, out_value);
2523+  }
2524+  return index;
2525+}
2526+
2527+#undef MS_SIMD_INSTRUCTION
2528+#undef BLOCK_NUM
2529+#pragma GCC pop_options
2530+#undef MS_SIMD_AVX
2531+#ifdef __cplusplus
2532+}
2533+#endif
2534+#endif
2535diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
2536new file mode 100644
2537index 00000000..523e120e
2538--- /dev/null
2539+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
2540@@ -0,0 +1,93 @@
2541+/**
2542+ * Copyright 2022 Huawei Technologies Co., Ltd
2543+ *
2544+ * Licensed under the Apache License, Version 2.0 (the "License");
2545+ * you may not use this file except in compliance with the License.
2546+ * You may obtain a copy of the License at
2547+ *
2548+ * http://www.apache.org/licenses/LICENSE-2.0
2549+ *
2550+ * Unless required by applicable law or agreed to in writing, software
2551+ * distributed under the License is distributed on an "AS IS" BASIS,
2552+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2553+ * See the License for the specific language governing permissions and
2554+ * limitations under the License.
2555+ */
2556+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_
2557+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_
2558+
2559+#include "nnacl/intrinsics/ms_simd_instructions.h"
2560+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2561+
2562+#ifdef __cplusplus
2563+extern "C" {
2564+#endif
2565+#pragma GCC push_options
2566+#pragma GCC target("avx", "avx2")
2567+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2568+#define BLOCK_NUM 8
2569+#define MS_SIMD_AVX
2570+
2571+// act_type must be 0, 1, 3. 0: no_act, 1: relu, 3: relu6.
2572+static inline int64_t GemmIsNotPackAVX(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
2573+  int deep, int act_type) {
2574+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
2575+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
2576+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
2577+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);
2578+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2579+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
2580+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
2581+    if (act_type != 0) {
2582+      dst = SIMD_MAX_F32(dst, down_threshold);
2583+      if (act_type == 3) {
2584+        dst = SIMD_MIN_F32(dst, up_threshold);
2585+      }
2586+    }
2587+    SIMD_ST_F32(c + index, dst);
2588+  }
2589+
2590+  return index;
2591+}
2592+
2593+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)
2594+static inline int64_t GemmIsNotPackOptimizeCoreAVX(int64_t index, const float *a, const float *b, int k, float *dst) {
2595+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
2596+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2597+    SIMD_F32 weight = SIMD_LD_F32(b + index);
2598+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
2599+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
2600+  }
2601+  *dst += SIMD_REDUCE_ADD_F32(dst1);
2602+  return index;
2603+}
2604+#endif
2605+
2606+static inline int64_t MatVecMulNoPackCoreAVX(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
2607+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
2608+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
2609+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
2610+    for (int64_t k = 0; k < depth; ++k) {
2611+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
2612+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
2613+      out = SIMD_FMADD_F32(left, right, out);
2614+    }
2615+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
2616+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
2617+      if (act_type == 0x3) {
2618+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
2619+      }
2620+    }
2621+    SIMD_ST_F32(c + oc_index, out);
2622+  }
2623+  return oc_index;
2624+}
2625+
2626+#undef MS_SIMD_INSTRUCTION
2627+#undef BLOCK_NUM
2628+#pragma GCC pop_options
2629+#undef MS_SIMD_AVX
2630+#ifdef __cplusplus
2631+}
2632+#endif
2633+#endif
2634diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
2635new file mode 100644
2636index 00000000..a5d8b0a0
2637--- /dev/null
2638+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
2639@@ -0,0 +1,218 @@
2640+/**
2641+ * Copyright 2022 Huawei Technologies Co., Ltd
2642+ *
2643+ * Licensed under the Apache License, Version 2.0 (the "License");
2644+ * you may not use this file except in compliance with the License.
2645+ * You may obtain a copy of the License at
2646+ *
2647+ * http://www.apache.org/licenses/LICENSE-2.0
2648+ *
2649+ * Unless required by applicable law or agreed to in writing, software
2650+ * distributed under the License is distributed on an "AS IS" BASIS,
2651+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2652+ * See the License for the specific language governing permissions and
2653+ * limitations under the License.
2654+ */
2655+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX_H_
2656+#define MINDSPORE_NNACL_FP32_MUL_AVX_H_
2657+
2658+#include "nnacl/intrinsics/ms_simd_instructions.h"
2659+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2660+
2661+#ifdef __cplusplus
2662+extern "C" {
2663+#endif
2664+#pragma GCC push_options
2665+#pragma GCC target("avx", "avx2")
2666+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2667+#define BLOCK_NUM 8
2668+#define MS_SIMD_AVX
2669+
2670+static inline int ElementMulAVX(int index, const float *in0, const float *in1, float *out, int size) {
2671+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2672+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2673+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2674+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
2675+    SIMD_ST_F32(out + index, vout);
2676+  }
2677+  return index;
2678+}
2679+
2680+static inline int ElementMulReluAVX(int index, const float *in0, const float *in1, float *out, int size) {
2681+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2682+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2683+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2684+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
2685+    SIMD_ST_F32(out + index, vout);
2686+  }
2687+  return index;
2688+}
2689+
2690+static inline int ElementMulRelu6AVX(int index, const float *in0, const float *in1, float *out, int size) {
2691+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2692+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2693+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2694+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
2695+    SIMD_ST_F32(out + index, vout);
2696+  }
2697+  return index;
2698+}
2699+
2700+static inline int ElementMulIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2701+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2702+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2703+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2704+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
2705+    SIMD_ST_EPI32(out + index, vout);
2706+  }
2707+  return index;
2708+}
2709+
2710+static inline int ElementMulReluIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2711+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2712+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2713+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2714+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
2715+    SIMD_ST_EPI32(out + index, vout);
2716+  }
2717+  return index;
2718+}
2719+
2720+static inline int ElementMulRelu6IntAVX(int index, const int *in0, const int *in1, int *out, int size) {
2721+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2722+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2723+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2724+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
2725+    SIMD_ST_EPI32(out + index, vout);
2726+  }
2727+  return index;
2728+}
2729+
2730+static inline int ElementOptMulNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
2731+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
2732+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2733+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2734+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
2735+    SIMD_ST_F32(out + index, vout);
2736+  }
2737+  return index;
2738+}
2739+
2740+static inline int ElementOptMulNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
2741+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2742+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2743+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2744+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
2745+    SIMD_ST_F32(out + index, vout);
2746+  }
2747+  return index;
2748+}
2749+
2750+static inline int ElementOptMulReluNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
2751+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
2752+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2753+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2754+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
2755+    SIMD_ST_F32(out + index, vout);
2756+  }
2757+  return index;
2758+}
2759+
2760+static inline int ElementOptMulReluNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
2761+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2762+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2763+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2764+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
2765+    SIMD_ST_F32(out + index, vout);
2766+  }
2767+  return index;
2768+}
2769+
2770+static inline int ElementOptMulRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, int size) {
2771+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
2772+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2773+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
2774+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
2775+    SIMD_ST_F32(out + index, vout);
2776+  }
2777+  return index;
2778+}
2779+
2780+static inline int ElementOptMulRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, int size) {
2781+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
2782+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2783+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
2784+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
2785+    SIMD_ST_F32(out + index, vout);
2786+  }
2787+  return index;
2788+}
2789+
2790+static inline int ElementOptMulIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2791+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
2792+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2793+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2794+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
2795+    SIMD_ST_EPI32(out + index, vout);
2796+  }
2797+  return index;
2798+}
2799+
2800+static inline int ElementOptMulIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2801+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2802+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2803+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2804+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
2805+    SIMD_ST_EPI32(out + index, vout);
2806+  }
2807+  return index;
2808+}
2809+
2810+static inline int ElementOptMulReluIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2811+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
2812+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2813+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2814+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
2815+    SIMD_ST_EPI32(out + index, vout);
2816+  }
2817+  return index;
2818+}
2819+
2820+static inline int ElementOptMulReluIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2821+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2822+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2823+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2824+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
2825+    SIMD_ST_EPI32(out + index, vout);
2826+  }
2827+  return index;
2828+}
2829+
2830+static inline int ElementOptMulRelu6IntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
2831+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
2832+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2833+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
2834+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
2835+    SIMD_ST_EPI32(out + index, vout);
2836+  }
2837+  return index;
2838+}
2839+
2840+static inline int ElementOptMulRelu6IntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
2841+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
2842+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2843+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
2844+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
2845+    SIMD_ST_EPI32(out + index, vout);
2846+  }
2847+  return index;
2848+}
2849+
2850+#undef MS_SIMD_INSTRUCTION
2851+#undef BLOCK_NUM
2852+#pragma GCC pop_options
2853+#undef MS_SIMD_AVX
2854+#ifdef __cplusplus
2855+}
2856+#endif
2857+#endif
2858diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
2859new file mode 100644
2860index 00000000..d4bd2305
2861--- /dev/null
2862+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
2863@@ -0,0 +1,84 @@
2864+/**
2865+ * Copyright 2022 Huawei Technologies Co., Ltd
2866+ *
2867+ * Licensed under the Apache License, Version 2.0 (the "License");
2868+ * you may not use this file except in compliance with the License.
2869+ * You may obtain a copy of the License at
2870+ *
2871+ * http://www.apache.org/licenses/LICENSE-2.0
2872+ *
2873+ * Unless required by applicable law or agreed to in writing, software
2874+ * distributed under the License is distributed on an "AS IS" BASIS,
2875+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2876+ * See the License for the specific language governing permissions and
2877+ * limitations under the License.
2878+ */
2879+#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX_H_
2880+#define MINDSPORE_NNACL_FP32_POOLING_AVX_H_
2881+
2882+#include "nnacl/intrinsics/ms_simd_instructions.h"
2883+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2884+
2885+#ifdef __cplusplus
2886+extern "C" {
2887+#endif
2888+#pragma GCC push_options
2889+#pragma GCC target("avx", "avx2")
2890+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2891+#define BLOCK_NUM 8
2892+#define MS_SIMD_AVX
2893+
2894+static inline int AvgPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel,
2895+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
2896+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
2897+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
2898+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
2899+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
2900+    const float *src_c_ptr = src_plane_ptr + ci;
2901+    float *dst_c_ptr = dst_plane_ptr + ci;
2902+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
2903+    int real_count = 0;
2904+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
2905+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
2906+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
2907+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
2908+        ++real_count;
2909+      }
2910+    }
2911+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
2912+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
2913+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
2914+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
2915+  }
2916+  return ci;
2917+}
2918+
2919+static inline int MaxPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel,
2920+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
2921+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
2922+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
2923+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
2924+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
2925+    const float *src_c_ptr = src_plane_ptr + ci;
2926+    float *dst_c_ptr = dst_plane_ptr + ci;
2927+    SIMD_F32 tmp_max = min_val;
2928+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
2929+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
2930+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
2931+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
2932+      }
2933+    }
2934+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
2935+    SIMD_ST_F32(dst_c_ptr, tmp_max);
2936+  }
2937+  return ci;
2938+}
2939+
2940+#undef MS_SIMD_INSTRUCTION
2941+#undef BLOCK_NUM
2942+#pragma GCC pop_options
2943+#undef MS_SIMD_AVX
2944+#ifdef __cplusplus
2945+}
2946+#endif
2947+#endif
2948diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
2949new file mode 100644
2950index 00000000..2ada6cb3
2951--- /dev/null
2952+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
2953@@ -0,0 +1,101 @@
2954+/**
2955+ * Copyright 2022 Huawei Technologies Co., Ltd
2956+ *
2957+ * Licensed under the Apache License, Version 2.0 (the "License");
2958+ * you may not use this file except in compliance with the License.
2959+ * You may obtain a copy of the License at
2960+ *
2961+ * http://www.apache.org/licenses/LICENSE-2.0
2962+ *
2963+ * Unless required by applicable law or agreed to in writing, software
2964+ * distributed under the License is distributed on an "AS IS" BASIS,
2965+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2966+ * See the License for the specific language governing permissions and
2967+ * limitations under the License.
2968+ */
2969+#ifndef MINDSPORE_NNACL_FP32_POWER_AVX_H_
2970+#define MINDSPORE_NNACL_FP32_POWER_AVX_H_
2971+
2972+#include "nnacl/intrinsics/ms_simd_instructions.h"
2973+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
2974+
2975+#ifdef __cplusplus
2976+extern "C" {
2977+#endif
2978+#pragma GCC push_options
2979+#pragma GCC target("avx", "avx2")
2980+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
2981+#define BLOCK_NUM 8
2982+#define MS_SIMD_AVX
2983+
2984+static inline int PowerBroadCastIntExponentAVX(int index, const float *input, int exponent, float *output, int len,
2985+  float scale, float shift) {
2986+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
2987+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
2988+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
2989+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
2990+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
2991+    int exp = abs(exponent);
2992+    while (exp) {
2993+      if (exp % 2) {
2994+        result = SIMD_MUL_F32(result, tmp);
2995+      }
2996+      tmp = SIMD_MUL_SQUARE_F32(tmp);
2997+      exp = exp / 2;
2998+    }
2999+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
3000+  }
3001+  return index;
3002+}
3003+
3004+static inline int PowerBroadCastFloatExponentAVX(int index, const float *input, float exponent, float *output, int len,
3005+  float scale, float shift) {
3006+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
3007+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
3008+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3009+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
3010+    SIMD_F32 result;
3011+    for (int i = 0; i < BLOCK_NUM; ++i) {
3012+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
3013+    }
3014+    SIMD_ST_F32(output + index, result);
3015+  }
3016+  return index;
3017+}
3018+
3019+static inline int PowerSingleExponentAVX(int index, const float *input, const float *exponent, float *output, int len,
3020+  float scale, float shift) {
3021+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
3022+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
3023+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3024+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
3025+    for (int j = 0; j < BLOCK_NUM; ++j) {
3026+      float cur_exponent = exponent[index + j];
3027+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
3028+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
3029+        int exp = abs((int)(cur_exponent));
3030+        float result = 1;
3031+        while (exp) {
3032+          if (exp % 2) {
3033+            result *= cur_val;
3034+          }
3035+          cur_val *= cur_val;
3036+          exp = exp / 2;
3037+        }
3038+        output[index + j] = cur_exponent >= 0 ? result : 1 / result;
3039+      } else {
3040+        output[index + j] = powf(cur_val, cur_exponent);
3041+      }
3042+    }
3043+  }
3044+  return index;
3045+}
3046+
3047+#undef MS_SIMD_INSTRUCTION
3048+#undef BLOCK_NUM
3049+#pragma GCC pop_options
3050+#undef MS_SIMD_AVX
3051+#ifdef __cplusplus
3052+}
3053+#endif
3054+#endif
3055diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
3056new file mode 100644
3057index 00000000..03339e42
3058--- /dev/null
3059+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
3060@@ -0,0 +1,181 @@
3061+/**
3062+ * Copyright 2022 Huawei Technologies Co., Ltd
3063+ *
3064+ * Licensed under the Apache License, Version 2.0 (the "License");
3065+ * you may not use this file except in compliance with the License.
3066+ * You may obtain a copy of the License at
3067+ *
3068+ * http://www.apache.org/licenses/LICENSE-2.0
3069+ *
3070+ * Unless required by applicable law or agreed to in writing, software
3071+ * distributed under the License is distributed on an "AS IS" BASIS,
3072+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3073+ * See the License for the specific language governing permissions and
3074+ * limitations under the License.
3075+ */
3076+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_
3077+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_
3078+
3079+#include "nnacl/intrinsics/ms_simd_instructions.h"
3080+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
3081+
3082+#ifdef __cplusplus
3083+extern "C" {
3084+#endif
3085+#pragma GCC push_options
3086+#pragma GCC target("avx", "avx2")
3087+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
3088+#define BLOCK_NUM 8
3089+#define MS_SIMD_AVX
3090+
3091+static inline int64_t ReduceSumAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3092+  int axis_size) {
3093+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3094+    const float *inner_src = outer_src + index;
3095+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3096+    for (int i = 0; i < axis_size; i++) {
3097+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3098+    }
3099+    SIMD_ST_F32(outer_dst + index, tmp);
3100+  }
3101+  return index;
3102+}
3103+
3104+static inline int64_t ReduceMeanAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3105+  int axis_size) {
3106+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3107+    const float *inner_src = outer_src + index;
3108+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3109+    for (int i = 0; i < axis_size; i++) {
3110+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3111+    }
3112+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
3113+  }
3114+  return index;
3115+}
3116+
3117+static inline int64_t ReduceMinAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3118+  int axis_size) {
3119+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3120+    const float *inner_src = outer_src + index;
3121+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
3122+    for (int i = 0; i < axis_size; i++) {
3123+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3124+    }
3125+    SIMD_ST_F32(outer_dst + index, tmp);
3126+  }
3127+  return index;
3128+}
3129+
3130+static inline int64_t ReduceMaxAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3131+  int axis_size) {
3132+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3133+    const float *inner_src = outer_src + index;
3134+    SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX);
3135+    for (int i = 0; i < axis_size; i++) {
3136+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3137+    }
3138+    SIMD_ST_F32(outer_dst + index, tmp);
3139+  }
3140+  return index;
3141+}
3142+
3143+static inline int64_t ReduceProdAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3144+  int axis_size) {
3145+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3146+    const float *inner_src = outer_src + index;
3147+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
3148+    for (int i = 0; i < axis_size; i++) {
3149+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
3150+    }
3151+    SIMD_ST_F32(outer_dst + index, tmp);
3152+  }
3153+  return index;
3154+}
3155+
3156+static inline int64_t ReduceSumSquareAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3157+  int axis_size) {
3158+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3159+    const float *inner_src = outer_src + index;
3160+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3161+    for (int i = 0; i < axis_size; i++) {
3162+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
3163+    }
3164+    SIMD_ST_F32(outer_dst + index, tmp);
3165+  }
3166+  return index;
3167+}
3168+
3169+static inline int64_t ReduceL2NormAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
3170+  int axis_size) {
3171+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3172+    const float *inner_src = outer_src + index;
3173+    SIMD_F32 tmp = SIMD_MOV_F32(0);
3174+    for (int i = 0; i < axis_size; i++) {
3175+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
3176+    }
3177+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));
3178+  }
3179+  return index;
3180+}
3181+
3182+static inline int64_t IntReduceSumAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3183+  int axis_size) {
3184+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3185+    const int *inner_src = outer_src + index;
3186+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
3187+    for (int i = 0; i < axis_size; i++) {
3188+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3189+    }
3190+    SIMD_ST_EPI32(outer_dst + index, tmp);
3191+  }
3192+  return index;
3193+}
3194+
3195+static inline int64_t IntReduceMeanAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3196+  int axis_size) {
3197+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3198+    const int *inner_src = outer_src + index;
3199+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
3200+    for (int i = 0; i < axis_size; i++) {
3201+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3202+    }
3203+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));
3204+  }
3205+  return index;
3206+}
3207+
3208+static inline int64_t IntReduceMinAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3209+  int axis_size) {
3210+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3211+    const int *inner_src = outer_src + index;
3212+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);
3213+    for (int i = 0; i < axis_size; i++) {
3214+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3215+    }
3216+    SIMD_ST_EPI32(outer_dst + index, tmp);
3217+  }
3218+  return index;
3219+}
3220+
3221+static inline int64_t IntReduceMaxAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
3222+  int axis_size) {
3223+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3224+    const int *inner_src = outer_src + index;
3225+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);
3226+    for (int i = 0; i < axis_size; i++) {
3227+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
3228+    }
3229+    SIMD_ST_EPI32(outer_dst + index, tmp);
3230+  }
3231+  return index;
3232+}
3233+
3234+#undef MS_SIMD_INSTRUCTION
3235+#undef BLOCK_NUM
3236+#pragma GCC pop_options
3237+#undef MS_SIMD_AVX
3238+#ifdef __cplusplus
3239+}
3240+#endif
3241+#endif
3242diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
3243new file mode 100644
3244index 00000000..8229111d
3245--- /dev/null
3246+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
3247@@ -0,0 +1,87 @@
3248+/**
3249+ * Copyright 2022 Huawei Technologies Co., Ltd
3250+ *
3251+ * Licensed under the Apache License, Version 2.0 (the "License");
3252+ * you may not use this file except in compliance with the License.
3253+ * You may obtain a copy of the License at
3254+ *
3255+ * http://www.apache.org/licenses/LICENSE-2.0
3256+ *
3257+ * Unless required by applicable law or agreed to in writing, software
3258+ * distributed under the License is distributed on an "AS IS" BASIS,
3259+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3260+ * See the License for the specific language governing permissions and
3261+ * limitations under the License.
3262+ */
3263+
3264+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_
3265+#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_
3266+
3267+#include "nnacl/intrinsics/ms_simd_instructions.h"
3268+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
3269+
3270+#ifdef __cplusplus
3271+extern "C" {
3272+#endif
3273+#pragma GCC push_options
3274+#pragma GCC target("avx", "avx2")
3275+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
3276+#define BLOCK_NUM 8
3277+#define MS_SIMD_AVX
3278+
3279+static inline int64_t SoftmaxNormGetMaxAVX(int64_t index, const float *src, int cur_batch_offset,
3280+  float *max, int channel) {
3281+  if (channel >= BLOCK_NUM * BLOCK_NUM) {
3282+    SIMD_F32 max_val = SIMD_MOV_F32(*max);
3283+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3284+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
3285+    }
3286+    *max = SIMD_GET_MAX_F32(max_val);
3287+  }
3288+  return index;
3289+}
3290+
3291+static inline int64_t SoftmaxNormCalcNormAVX(int64_t index, const float *src, float *dst,
3292+  int cur_batch_offset, float max, int channel) {
3293+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3294+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
3295+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
3296+  }
3297+  return index;
3298+}
3299+
3300+static inline int64_t SoftmaxLastAxisGetExpSumAVX(int64_t index, const float *src, float *dst,
3301+  int cur_batch_offset, float max, float *exp_sum, int channel) {
3302+#ifndef _WIN32
3303+  SIMD_F32 sum_val = SIMD_SET0_F32;
3304+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3305+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
3306+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
3307+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
3308+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
3309+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
3310+  }
3311+  *exp_sum += SIMD_GET_SUM_F32(sum_val);
3312+#endif
3313+  return index;
3314+}
3315+
3316+static inline int64_t SoftmaxLastAxisGetResultAVX(int64_t index, const float *src, float *dst,
3317+  int cur_batch_offset, float exp_sum, int channel) {
3318+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
3319+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3320+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
3321+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
3322+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
3323+  }
3324+  return index;
3325+}
3326+
3327+#undef MS_SIMD_INSTRUCTION
3328+#undef BLOCK_NUM
3329+#pragma GCC pop_options
3330+#undef MS_SIMD_AVX
3331+#ifdef __cplusplus
3332+}
3333+#endif
3334+#endif
3335diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
3336new file mode 100644
3337index 00000000..a3ed93d4
3338--- /dev/null
3339+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
3340@@ -0,0 +1,167 @@
3341+/**
3342+ * Copyright 2022 Huawei Technologies Co., Ltd
3343+ *
3344+ * Licensed under the Apache License, Version 2.0 (the "License");
3345+ * you may not use this file except in compliance with the License.
3346+ * You may obtain a copy of the License at
3347+ *
3348+ * http://www.apache.org/licenses/LICENSE-2.0
3349+ *
3350+ * Unless required by applicable law or agreed to in writing, software
3351+ * distributed under the License is distributed on an "AS IS" BASIS,
3352+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3353+ * See the License for the specific language governing permissions and
3354+ * limitations under the License.
3355+ */
3356+
3357+#ifndef MINDSPORE_NNACL_FP32_SUB_AVX_H_
3358+#define MINDSPORE_NNACL_FP32_SUB_AVX_H_
3359+
3360+#include "nnacl/intrinsics/ms_simd_instructions.h"
3361+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
3362+
3363+#ifdef __cplusplus
3364+extern "C" {
3365+#endif
3366+#pragma GCC push_options
3367+#pragma GCC target("avx", "avx2")
3368+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
3369+#define BLOCK_NUM 8
3370+#define MS_SIMD_AVX
3371+
3372+static inline int ElementOptSubNum0AVX(int index, const float *in0, const float *in1, float *out,
3373+                                                      int size) {
3374+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
3375+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3376+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3377+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
3378+    SIMD_ST_F32(out + index, vout);
3379+  }
3380+  return index;
3381+}
3382+
3383+static inline int ElementOptSubNum1AVX(int index, const float *in0, const float *in1, float *out,
3384+                                                      int size) {
3385+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
3386+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3387+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3388+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
3389+    SIMD_ST_F32(out + index, vout);
3390+  }
3391+  return index;
3392+}
3393+
3394+static inline int ElementOptSubIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
3395+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
3396+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3397+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
3398+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
3399+    SIMD_ST_EPI32(out + index, vout);
3400+  }
3401+  return index;
3402+}
3403+
3404+static inline int ElementOptSubIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
3405+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
3406+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3407+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
3408+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
3409+    SIMD_ST_EPI32(out + index, vout);
3410+  }
3411+  return index;
3412+}
3413+
3414+static inline int ElementOptSubReluNum0AVX(int index, const float *in0, const float *in1, float *out,
3415+                                                          int size) {
3416+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
3417+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3418+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3419+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
3420+    SIMD_ST_F32(out + index, vout);
3421+  }
3422+  return index;
3423+}
3424+
3425+static inline int ElementOptSubReluNum1AVX(int index, const float *in0, const float *in1, float *out,
3426+                                                          int size) {
3427+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
3428+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3429+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3430+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
3431+    SIMD_ST_F32(out + index, vout);
3432+  }
3433+  return index;
3434+}
3435+
3436+static inline int ElementOptSubRelu6Num0AVX(int index, const float *in0, const float *in1, float *out,
3437+                                                           int size) {
3438+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
3439+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3440+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3441+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
3442+    SIMD_ST_F32(out + index, vout);
3443+  }
3444+  return index;
3445+}
3446+
3447+static inline int ElementOptSubRelu6Num1AVX(int index, const float *in0, const float *in1, float *out,
3448+                                                           int size) {
3449+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
3450+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3451+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3452+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
3453+    SIMD_ST_F32(out + index, vout);
3454+  }
3455+  return index;
3456+}
3457+
3458+static inline int ElementSubAVX(int index, const float *in0, const float *in1, float *out, int size) {
3459+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3460+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3461+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3462+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
3463+    SIMD_ST_F32(out + index, vout);
3464+  }
3465+  return index;
3466+}
3467+
3468+static inline int ElementSubIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
3469+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3470+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
3471+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
3472+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
3473+    SIMD_ST_EPI32(out + index, vout);
3474+  }
3475+  return index;
3476+}
3477+
3478+static inline int ElementSubReluAVX(int index, const float *in0, const float *in1, float *out,
3479+                                                   int size) {
3480+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3481+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3482+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3483+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
3484+    SIMD_ST_F32(out + index, vout);
3485+  }
3486+  return index;
3487+}
3488+
3489+static inline int ElementSubRelu6AVX(int index, const float *in0, const float *in1, float *out,
3490+                                                    int size) {
3491+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3492+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
3493+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
3494+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
3495+    SIMD_ST_F32(out + index, vout);
3496+  }
3497+  return index;
3498+}
3499+
3500+#undef MS_SIMD_INSTRUCTION
3501+#undef BLOCK_NUM
3502+#pragma GCC pop_options
3503+#undef MS_SIMD_AVX
3504+#ifdef __cplusplus
3505+}
3506+#endif
3507+#endif
3508diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
3509new file mode 100644
3510index 00000000..f6457628
3511--- /dev/null
3512+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
3513@@ -0,0 +1,221 @@
3514+/**
3515+ * Copyright 2022 Huawei Technologies Co., Ltd
3516+ *
3517+ * Licensed under the Apache License, Version 2.0 (the "License");
3518+ * you may not use this file except in compliance with the License.
3519+ * You may obtain a copy of the License at
3520+ *
3521+ * http://www.apache.org/licenses/LICENSE-2.0
3522+ *
3523+ * Unless required by applicable law or agreed to in writing, software
3524+ * distributed under the License is distributed on an "AS IS" BASIS,
3525+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3526+ * See the License for the specific language governing permissions and
3527+ * limitations under the License.
3528+ */
3529+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
3530+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
3531+
3532+#include "nnacl/intrinsics/ms_simd_instructions.h"
3533+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
3534+
3535+#ifdef __cplusplus
3536+extern "C" {
3537+#endif
3538+#pragma GCC push_options
3539+#pragma GCC target("avx512f")
3540+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
3541+#define BLOCK_NUM 16
3542+#define MS_SIMD_AVX512
3543+
3544+static inline int Fp32ReluAVX512(int index, const float *src, int length, float *dst) {
3545+    SIMD_F32 zero = SIMD_SET0_F32;
3546+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3547+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
3548+    }
3549+    return index;
3550+}
3551+
3552+static inline int Int32ReluAVX512(int index, const int32_t *src, int length, int32_t *dst) {
3553+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0);
3554+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3555+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
3556+    }
3557+    return index;
3558+}
3559+
3560+static inline int Fp32Relu6AVX512(int index, const float *src, int length, float *dst) {
3561+    SIMD_F32 zero = SIMD_SET0_F32;
3562+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
3563+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3564+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
3565+    }
3566+    return index;
3567+}
3568+
3569+static inline int LReluAVX512(int index, const float *src, int length, float *dst, float alpha) {
3570+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
3571+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3572+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3573+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
3574+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
3575+    }
3576+    return index;
3577+}
3578+
3579+static inline int SigmoidAVX512(int index, const float *src, int length, float *dst) {
3580+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3581+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
3582+        SIMD_ST_F32(dst + index,
3583+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
3584+    }
3585+    return index;
3586+}
3587+
3588+static inline int TanhAVX512(int index, const float *src, int length, float *dst) {
3589+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3590+        SIMD_F32 input = SIMD_LD_F32(src + index);
3591+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
3592+    }
3593+    return index;
3594+}
3595+
3596+static inline int SwishAVX512(int index, const float *src, int length, float *dst) {
3597+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3598+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
3599+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
3600+        SIMD_ST_F32(dst + index,
3601+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
3602+    }
3603+    return index;
3604+}
3605+
3606+static inline int HSwishAVX512(int index, const float *src, int length, float *dst) {
3607+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3608+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
3609+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
3610+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
3611+    }
3612+    return index;
3613+}
3614+
3615+static inline int HSigmoidAVX512(int index, const float *src, int length, float *dst) {
3616+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3617+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
3618+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
3619+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
3620+    }
3621+    return index;
3622+}
3623+
3624+static inline int HardTanhNoLimitMinAVX512(int index, const float *src, int length, float *dst, float min_val,
3625+                                            float max_val) {
3626+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3627+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
3628+    }
3629+    return index;
3630+}
3631+
3632+static inline int HardTanhNoLimitMaxAVX512(int index, const float *src, int length, float *dst, float min_val,
3633+                                            float max_val) {
3634+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3635+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
3636+    }
3637+    return index;
3638+}
3639+
3640+static inline int HardTanhLimitMinMaxAVX512(int index, const float *src, int length, float *dst, float min_val,
3641+                                             float max_val) {
3642+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3643+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
3644+    }
3645+    return index;
3646+}
3647+
3648+static inline int GeluApproximateAVX512(int index, const float *src, int length, float *dst) {
3649+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3650+        SIMD_F32 in = SIMD_LD_F32(src + index);
3651+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
3652+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
3653+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
3654+    }
3655+    return index;
3656+}
3657+
3658+static inline int GeluAVX512(int index, const float *src, int length, float *dst) {
3659+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
3660+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
3661+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
3662+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3663+      SIMD_F32 in = SIMD_LD_F32(src + index);
3664+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
3665+      SIMD_ST_F32(dst + index, res);
3666+    }
3667+    return index;
3668+}
3669+
3670+static inline int EluAVX512(int index, const float *src, int length, float *dst, float alpha) {
3671+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3672+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3673+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
3674+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
3675+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
3676+    }
3677+    return index;
3678+}
3679+
3680+static inline int CeluAVX512(int index, const float *src, int length, float *dst, float alpha) {
3681+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3682+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3683+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
3684+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
3685+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
3686+    }
3687+    return index;
3688+}
3689+
3690+static inline int HShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) {
3691+    const float neg_lambd = -1 * lambd;
3692+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3693+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3694+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
3695+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
3696+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
3697+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
3698+    }
3699+    return index;
3700+}
3701+
3702+static inline int SoftShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) {
3703+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
3704+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
3705+
3706+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3707+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
3708+        /* v0 = (in > lamdb) & (in - lamdb) */
3709+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
3710+        /* v1 = (in < -lamdb) & (in + lamdb) */
3711+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
3712+        /* out = (v0 | v1) */
3713+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
3714+    }
3715+    return index;
3716+}
3717+
3718+static inline int SoftsignFp32OptAVX512(int index, const float *src, int length, float *dst) {
3719+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3720+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
3721+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
3722+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
3723+    }
3724+    return index;
3725+}
3726+
3727+#undef MS_SIMD_INSTRUCTION
3728+#undef BLOCK_NUM
3729+#pragma GCC pop_options
3730+#undef MS_SIMD_AVX512
3731+#ifdef __cplusplus
3732+}
3733+#endif
3734+#endif
3735diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
3736new file mode 100644
3737index 00000000..62d34db4
3738--- /dev/null
3739+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
3740@@ -0,0 +1,57 @@
3741+/**
3742+ * Copyright 2022 Huawei Technologies Co., Ltd
3743+ *
3744+ * Licensed under the Apache License, Version 2.0 (the "License");
3745+ * you may not use this file except in compliance with the License.
3746+ * You may obtain a copy of the License at
3747+ *
3748+ * http://www.apache.org/licenses/LICENSE-2.0
3749+ *
3750+ * Unless required by applicable law or agreed to in writing, software
3751+ * distributed under the License is distributed on an "AS IS" BASIS,
3752+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3753+ * See the License for the specific language governing permissions and
3754+ * limitations under the License.
3755+ */
3756+
3757+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_
3758+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_
3759+
3760+#include "nnacl/intrinsics/ms_simd_instructions.h"
3761+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
3762+
3763+#ifdef __cplusplus
3764+extern "C" {
3765+#endif
3766+#pragma GCC push_options
3767+#pragma GCC target("avx512f")
3768+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
3769+#define BLOCK_NUM 16
3770+#define MS_SIMD_AVX512
3771+
3772+static inline int ShrinkGradAVX512(int index, const float *src0, const float *src1,
3773+                                               int length, float *dst, float lambd) {
3774+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
3775+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
3776+
3777+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3778+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
3779+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
3780+
3781+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
3782+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
3783+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
3784+
3785+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
3786+    }
3787+    return index;
3788+}
3789+
3790+#undef MS_SIMD_INSTRUCTION
3791+#undef BLOCK_NUM
3792+#pragma GCC pop_options
3793+#undef MS_SIMD_AVX512
3794+#ifdef __cplusplus
3795+}
3796+#endif
3797+#endif
3798diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
3799new file mode 100644
3800index 00000000..0579d58a
3801--- /dev/null
3802+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
3803@@ -0,0 +1,210 @@
3804+/**
3805+ * Copyright 2022 Huawei Technologies Co., Ltd
3806+ *
3807+ * Licensed under the Apache License, Version 2.0 (the "License");
3808+ * you may not use this file except in compliance with the License.
3809+ * You may obtain a copy of the License at
3810+ *
3811+ * http://www.apache.org/licenses/LICENSE-2.0
3812+ *
3813+ * Unless required by applicable law or agreed to in writing, software
3814+ * distributed under the License is distributed on an "AS IS" BASIS,
3815+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3816+ * See the License for the specific language governing permissions and
3817+ * limitations under the License.
3818+ */
3819+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_
3820+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_
3821+
3822+#include "nnacl/intrinsics/ms_simd_instructions.h"
3823+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
3824+
3825+#ifdef __cplusplus
3826+extern "C" {
3827+#endif
3828+#pragma GCC push_options
3829+#pragma GCC target("avx512f")
3830+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
3831+#define BLOCK_NUM 16
3832+#define MS_SIMD_AVX512
3833+#ifdef MS_SIMD_AVX512
3834+  static inline size_t AdamWeightDecayFp32AVX512(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3835+    const float *gradient, size_t end) {
3836+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3837+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3838+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3839+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3840+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3841+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3842+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3843+
3844+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3845+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
3846+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3847+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3848+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
3849+
3850+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3851+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3852+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3853+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3854+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3855+    avx_r0 = SIMD_SQRT_F32(v_r);
3856+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3857+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3858+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3859+    SIMD_ST_F32(m + index, m_r);
3860+    SIMD_ST_F32(v + index, v_r);
3861+    SIMD_ST_F32(var + index, var_r);
3862+  }
3863+
3864+  return index;
3865+}
3866+
3867+static inline size_t FusedCastAdamFp32Fp16AVX512(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3868+    float global_norm_reciprocal, size_t end) {
3869+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3870+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3871+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3872+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3873+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3874+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3875+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3876+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3877+
3878+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3879+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
3880+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3881+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3882+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
3883+
3884+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3885+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3886+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3887+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3888+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3889+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3890+    avx_r0 = SIMD_SQRT_F32(v_r);
3891+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3892+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3893+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3894+    SIMD_ST_F32(var + index, var_r);
3895+    SIMD_ST_F32(m + index, m_r);
3896+    SIMD_ST_F32(v + index, v_r);
3897+  }
3898+
3899+  return index;
3900+}
3901+
3902+static inline size_t FusedCastAdamFp32Fp32AVX512(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3903+    float global_norm_reciprocal, size_t end) {
3904+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3905+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3906+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3907+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3908+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3909+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3910+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3911+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3912+
3913+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3914+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
3915+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3916+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3917+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
3918+
3919+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3920+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3921+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3922+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3923+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3924+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3925+    avx_r0 = SIMD_SQRT_F32(v_r);
3926+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3927+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3928+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3929+    SIMD_ST_F32(var + index, var_r);
3930+    SIMD_ST_F32(m + index, m_r);
3931+    SIMD_ST_F32(v + index, v_r);
3932+  }
3933+
3934+  return index;
3935+}
3936+
3937+static inline size_t FusedCastAdamFp16Fp16AVX512(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3938+    float global_norm_reciprocal, size_t end) {
3939+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3940+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3941+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3942+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3943+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3944+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3945+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3946+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3947+
3948+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3949+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
3950+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3951+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3952+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
3953+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3954+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3955+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3956+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3957+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3958+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3959+    avx_r0 = SIMD_SQRT_F32(v_r);
3960+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3961+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3962+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3963+    SIMD_ST_F32(m + index, m_r);
3964+    SIMD_ST_F32(v + index, v_r);
3965+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
3966+  }
3967+
3968+  return index;
3969+}
3970+
3971+static inline size_t FusedCastAdamFp16Fp32AVX512(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
3972+    float global_norm_reciprocal, size_t end) {
3973+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
3974+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
3975+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
3976+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
3977+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
3978+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
3979+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
3980+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
3981+
3982+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
3983+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
3984+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
3985+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
3986+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
3987+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
3988+    m_r = SIMD_MUL_F32(m_r, beta1_r);
3989+    v_r = SIMD_MUL_F32(v_r, beta2_r);
3990+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
3991+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
3992+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
3993+    avx_r0 = SIMD_SQRT_F32(v_r);
3994+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
3995+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
3996+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
3997+    SIMD_ST_F32(m + index, m_r);
3998+    SIMD_ST_F32(v + index, v_r);
3999+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
4000+  }
4001+
4002+  return index;
4003+}
4004+#endif
4005+
4006+#undef MS_SIMD_INSTRUCTION
4007+#undef BLOCK_NUM
4008+#pragma GCC pop_options
4009+#undef MS_SIMD_AVX512
4010+#ifdef __cplusplus
4011+}
4012+#endif
4013+#endif
4014diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
4015new file mode 100644
4016index 00000000..5ec6a42e
4017--- /dev/null
4018+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
4019@@ -0,0 +1,124 @@
4020+/**
4021+ * Copyright 2022 Huawei Technologies Co., Ltd
4022+ *
4023+ * Licensed under the Apache License, Version 2.0 (the "License");
4024+ * you may not use this file except in compliance with the License.
4025+ * You may obtain a copy of the License at
4026+ *
4027+ * http://www.apache.org/licenses/LICENSE-2.0
4028+ *
4029+ * Unless required by applicable law or agreed to in writing, software
4030+ * distributed under the License is distributed on an "AS IS" BASIS,
4031+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4032+ * See the License for the specific language governing permissions and
4033+ * limitations under the License.
4034+ */
4035+
4036+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX512_H_
4037+#define MINDSPORE_NNACL_FP32_ADD_AVX512_H_
4038+
4039+#include "nnacl/intrinsics/ms_simd_instructions.h"
4040+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4041+
4042+#ifdef __cplusplus
4043+extern "C" {
4044+#endif
4045+#pragma GCC push_options
4046+#pragma GCC target("avx512f")
4047+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4048+#define BLOCK_NUM 16
4049+#define MS_SIMD_AVX512
4050+
// Broadcast add: out[i] = in0[0] + in1[i].  Processes BLOCK_NUM lanes per
// iteration starting at `index`; returns the first index NOT processed so the
// caller's scalar tail loop can finish the remainder (size - index elements).
static inline int ElementOptAddAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Integer variant of the broadcast add: out[i] = in0[0] + in1[i].
static inline int ElementOptAddIntAVX512(int index, const int *in0, const int *in1, int *out,
                                                     int size) {
  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
    SIMD_ST_EPI32(out + index, vout);
  }
  return index;
}

// Broadcast add fused with ReLU: out[i] = max(in0[0] + in1[i], 0).
static inline int ElementOptAddReluAVX512(int index, const float *in0, const float *in1, float *out,
                                                      int size) {
  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Broadcast add fused with ReLU6: out[i] = clamp(in0[0] + in1[i], 0, 6).
static inline int ElementOptAddRelu6AVX512(int index, const float *in0, const float *in1, float *out,
                                                       int size) {
  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}
4093+
// Element-wise add: out[i] = in0[i] + in1[i].  Processes BLOCK_NUM lanes per
// iteration; returns the first unprocessed index for the scalar tail loop.
static inline int ElementAddAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Element-wise add fused with ReLU: out[i] = max(in0[i] + in1[i], 0).
static inline int ElementAddReluAVX512(int index, const float *in0, const float *in1, float *out,
                                                   int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Element-wise add fused with ReLU6: out[i] = clamp(in0[i] + in1[i], 0, 6).
static inline int ElementAddRelu6AVX512(int index, const float *in0, const float *in1, float *out,
                                                    int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
    SIMD_ST_F32(out + index, vout);
  }
  return index;
}

// Integer element-wise add: out[i] = in0[i] + in1[i].
static inline int ElementAddIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
    SIMD_ST_EPI32(out + index, vout);
  }
  return index;
}
4135+
4136+#undef MS_SIMD_INSTRUCTION
4137+#undef BLOCK_NUM
4138+#pragma GCC pop_options
4139+#undef MS_SIMD_AVX512
4140+#ifdef __cplusplus
4141+}
4142+#endif
4143+#endif
4144diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
4145new file mode 100644
4146index 00000000..aa478969
4147--- /dev/null
4148+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
4149@@ -0,0 +1,254 @@
4150+/**
4151+ * Copyright 2022 Huawei Technologies Co., Ltd
4152+ *
4153+ * Licensed under the Apache License, Version 2.0 (the "License");
4154+ * you may not use this file except in compliance with the License.
4155+ * You may obtain a copy of the License at
4156+ *
4157+ * http://www.apache.org/licenses/LICENSE-2.0
4158+ *
4159+ * Unless required by applicable law or agreed to in writing, software
4160+ * distributed under the License is distributed on an "AS IS" BASIS,
4161+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4162+ * See the License for the specific language governing permissions and
4163+ * limitations under the License.
4164+ */
4165+
4166+#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX512_H_
4167+#define MINDSPORE_NNACL_ARITHMETIC_AVX512_H_
4168+
4169+#include "nnacl/intrinsics/ms_simd_instructions.h"
4170+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4171+
4172+#ifdef __cplusplus
4173+extern "C" {
4174+#endif
4175+#pragma GCC push_options
4176+#pragma GCC target("avx512f")
4177+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4178+#define BLOCK_NUM 16
4179+#define MS_SIMD_AVX512
4180+
#ifndef MS_SIMD_NEON
// Floored modulo (Python-style, result sign follows the divisor):
//   out[i] = in0[i] - floor(in0[i] / in1[i]) * in1[i]
static inline int ElementFloorModAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floored modulo with in0 broadcast from a scalar (Num0 = operand 0 is scalar).
static inline int ElementOptFloorModNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floored modulo with in1 broadcast from a scalar (Num1 = operand 1 is scalar).
static inline int ElementOptFloorModNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floor division: out[i] = floor(in0[i] / in1[i]).
static inline int ElementFloorDivAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, floor_tmp);
  }
  return index;
}

// Floor division with in0 broadcast from a scalar.
static inline int ElementOptFloorDivNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Floor division with in1 broadcast from a scalar.
static inline int ElementOptFloorDivNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}
#endif
4245+
// Integer "floor" division: out[i] = in0[i] / in1[i].
// NOTE(review): SIMD_DIV_EPI32 presumably truncates toward zero like C integer
// division; for mixed-sign operands that differs from true floor division
// (e.g. -7/2 -> -3 truncated vs -4 floored) — confirm the intended semantics.
static inline int ElementFloorDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer division with in0 broadcast from a scalar.
static inline int ElementOptFloorDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer division with in1 broadcast from a scalar.
static inline int ElementOptFloorDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}
4275+
// Element-wise maximum: out[i] = max(in0[i], in1[i]).  All variants process
// BLOCK_NUM lanes per iteration and return the first unprocessed index.
static inline int ElementMaximumAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Maximum with in0 broadcast from a scalar.
static inline int ElementOptMaximumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Maximum with in1 broadcast from a scalar.
static inline int ElementOptMaximumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Integer element-wise maximum.
static inline int ElementMaximumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer maximum with in0 broadcast from a scalar.
static inline int ElementOptMaximumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer maximum with in1 broadcast from a scalar.
static inline int ElementOptMaximumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}
4335+
// Integer element-wise minimum: out[i] = min(in0[i], in1[i]).
static inline int ElementMinimumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer minimum with in0 broadcast from a scalar.
static inline int ElementOptMinimumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Integer minimum with in1 broadcast from a scalar.
static inline int ElementOptMinimumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

// Float element-wise minimum: out[i] = min(in0[i], in1[i]).
static inline int ElementMinimumAVX512(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Float minimum with in0 broadcast from a scalar.
static inline int ElementOptMinimumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

// Float minimum with in1 broadcast from a scalar.
static inline int ElementOptMinimumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}
4395+
4396+#undef MS_SIMD_INSTRUCTION
4397+#undef BLOCK_NUM
4398+#pragma GCC pop_options
4399+#undef MS_SIMD_AVX512
4400+#ifdef __cplusplus
4401+}
4402+#endif
4403+#endif
4404diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
4405new file mode 100644
4406index 00000000..c671e327
4407--- /dev/null
4408+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
4409@@ -0,0 +1,129 @@
4410+/**
4411+ * Copyright 2022 Huawei Technologies Co., Ltd
4412+ *
4413+ * Licensed under the Apache License, Version 2.0 (the "License");
4414+ * you may not use this file except in compliance with the License.
4415+ * You may obtain a copy of the License at
4416+ *
4417+ * http://www.apache.org/licenses/LICENSE-2.0
4418+ *
4419+ * Unless required by applicable law or agreed to in writing, software
4420+ * distributed under the License is distributed on an "AS IS" BASIS,
4421+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4422+ * See the License for the specific language governing permissions and
4423+ * limitations under the License.
4424+ */
4425+
4426+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_
4427+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_
4428+
4429+#include "nnacl/intrinsics/ms_simd_instructions.h"
4430+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4431+
4432+#ifdef __cplusplus
4433+extern "C" {
4434+#endif
4435+#pragma GCC push_options
4436+#pragma GCC target("avx512f")
4437+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4438+#define BLOCK_NUM 16
4439+#define MS_SIMD_AVX512
4440+
#if defined(MS_SIMD_AVX512)
// Only AVX-512 provides a native abs instruction for fp32, so these are
// emitted only in the AVX-512 build (the guard is always true in this file,
// where MS_SIMD_AVX512 is defined above).
// out[i] = |input[i]|; returns the first unprocessed index.
static inline int ElementAbsAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}

// Integer absolute value: out[i] = |input[i]|.
static inline int ElementAbsIntAVX512(int index, const int *input, int *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
  }
  return index;
}
#endif
4457+
// out[i] = input[i]^2; returns the first unprocessed index.
static inline int ElementSquareAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin = SIMD_LD_F32(input + index);
    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
  }
  return index;
}

// out[i] = sqrt(input[i]).
static inline int ElementSqrtAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}

// out[i] = 1 / sqrt(input[i]).
// NOTE(review): SIMD_RSQRT_F32 may map to the approximate reciprocal-sqrt
// instruction, which has reduced precision — confirm against the macro impl.
static inline int ElementRsqrtAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
4479+
#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
// AVX-512 doesn't provide the round fp32 instruction used here, so the round
// kernel is only emitted for the AVX/SSE builds (compiled out in this file).
static inline int ElementRoundAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
#endif

#ifndef MS_SIMD_NEON
// NEON doesn't provide a floor fp32 instruction; out[i] = floor(input[i]).
static inline int ElementFloorAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
#endif

#ifndef MS_SIMD_NEON
// out[i] = ceil(input[i]); likewise unavailable on NEON.
static inline int ElementCeilAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
  }
  return index;
}
#endif
4508+
// out[i] = -input[i] (implemented as multiply by -1.0f).
static inline int ElementNegativeAVX512(int index, const float *input, float *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
  }
  return index;
}

// Integer negation: out[i] = -input[i].
static inline int ElementNegativeIntAVX512(int index, const int *input, int *output, const int element_size) {
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
  }
  return index;
}

// out[i] = 1 / input[i] (full-precision division, not the approximate rcp).
static inline int ElementReciprocalAVX512(int index, const float *input, float *output, const int element_size) {
  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
  }
  return index;
}
4530+
4531+#undef MS_SIMD_INSTRUCTION
4532+#undef BLOCK_NUM
4533+#pragma GCC pop_options
4534+#undef MS_SIMD_AVX512
4535+#ifdef __cplusplus
4536+}
4537+#endif
4538+#endif
4539diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
4540new file mode 100644
4541index 00000000..fd945984
4542--- /dev/null
4543+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
4544@@ -0,0 +1,67 @@
4545+/**
4546+ * Copyright 2022 Huawei Technologies Co., Ltd
4547+ *
4548+ * Licensed under the Apache License, Version 2.0 (the "License");
4549+ * you may not use this file except in compliance with the License.
4550+ * You may obtain a copy of the License at
4551+ *
4552+ * http://www.apache.org/licenses/LICENSE-2.0
4553+ *
4554+ * Unless required by applicable law or agreed to in writing, software
4555+ * distributed under the License is distributed on an "AS IS" BASIS,
4556+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4557+ * See the License for the specific language governing permissions and
4558+ * limitations under the License.
4559+ */
4560+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
4561+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_
4562+
4563+#include "nnacl/intrinsics/ms_simd_instructions.h"
4564+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4565+
4566+#ifdef __cplusplus
4567+extern "C" {
4568+#endif
4569+#pragma GCC push_options
4570+#pragma GCC target("avx512f")
4571+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4572+#define BLOCK_NUM 16
4573+#define MS_SIMD_AVX512
4574+
// Batch normalization without scale/offset, vectorized over channels:
//   out[c] = (x[c] - mean[c]) / sqrt(variance[c] + epsilon)
// Returns the first channel index not processed (scalar tail handles the rest).
static inline int BatchNormFp32AVX512(int index, const float *input, const float *mean,
  const float *variance, int channel, float epsilon, float *output) {
  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 input_data = SIMD_LD_F32(input + index);
    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
    SIMD_ST_F32(output + index, output_data);
  }
  return index;
}

// Fused batch norm with per-channel affine transform:
//   out[c] = scale[c] * (x[c] - mean[c]) / sqrt(variance[c] + epsilon) + offset[c]
static inline int FusedBatchNormFp32AVX512(int index, const float *input, const float *scale,
  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 input_data = SIMD_LD_F32(input + index);
    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
    SIMD_ST_F32(output + index, output_data);
  }
  return index;
}
4603+
4604+#undef MS_SIMD_INSTRUCTION
4605+#undef BLOCK_NUM
4606+#pragma GCC pop_options
4607+#undef MS_SIMD_AVX512
4608+#ifdef __cplusplus
4609+}
4610+#endif
4611+#endif
4612diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
4613new file mode 100644
4614index 00000000..f5353f61
4615--- /dev/null
4616+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
4617@@ -0,0 +1,69 @@
4618+/**
4619+ * Copyright 2022 Huawei Technologies Co., Ltd
4620+ *
4621+ * Licensed under the Apache License, Version 2.0 (the "License");
4622+ * you may not use this file except in compliance with the License.
4623+ * You may obtain a copy of the License at
4624+ *
4625+ * http://www.apache.org/licenses/LICENSE-2.0
4626+ *
4627+ * Unless required by applicable law or agreed to in writing, software
4628+ * distributed under the License is distributed on an "AS IS" BASIS,
4629+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4630+ * See the License for the specific language governing permissions and
4631+ * limitations under the License.
4632+ */
4633+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_
4634+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_
4635+
4636+#include "nnacl/intrinsics/ms_simd_instructions.h"
4637+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4638+
4639+#ifdef __cplusplus
4640+extern "C" {
4641+#endif
4642+#pragma GCC push_options
4643+#pragma GCC target("avx512f")
4644+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4645+#define BLOCK_NUM 16
4646+#define MS_SIMD_AVX512
4647+
4648+static inline int BCEWithLogitLossAVX512(int index, const float *logits, const float *label,
4649+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
4650+    float *reduction_sum) {
4651+    SIMD_F32 zero = SIMD_SET0_F32;
4652+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
4653+    SIMD_F32 middle_output = SIMD_SET0_F32;
4654+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4655+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
4656+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
4657+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
4658+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
4659+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
4660+      SIMD_F32 max_value = neg_logits_tmp;
4661+      max_value = SIMD_MIN_F32(max_value, zero);
4662+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
4663+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
4664+      SIMD_F32 log_exp_value =
4665+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
4666+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
4667+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
4668+      if (reduction) {
4669+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
4670+      } else {
4671+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
4672+      }
4673+    }
4674+    if (reduction) {
4675+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
4676+    }
4677+    return index;
4678+}
4679+#undef MS_SIMD_INSTRUCTION
4680+#undef BLOCK_NUM
4681+#pragma GCC pop_options
4682+#undef MS_SIMD_AVX512
4683+#ifdef __cplusplus
4684+}
4685+#endif
4686+#endif
4687diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
4688new file mode 100644
4689index 00000000..abdad5ff
4690--- /dev/null
4691+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
4692@@ -0,0 +1,64 @@
4693+/**
4694+ * Copyright 2022 Huawei Technologies Co., Ltd
4695+ *
4696+ * Licensed under the Apache License, Version 2.0 (the "License");
4697+ * you may not use this file except in compliance with the License.
4698+ * You may obtain a copy of the License at
4699+ *
4700+ * http://www.apache.org/licenses/LICENSE-2.0
4701+ *
4702+ * Unless required by applicable law or agreed to in writing, software
4703+ * distributed under the License is distributed on an "AS IS" BASIS,
4704+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4705+ * See the License for the specific language governing permissions and
4706+ * limitations under the License.
4707+ */
4708+
4709+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
4710+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
4711+
4712+#include "nnacl/intrinsics/ms_simd_instructions.h"
4713+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4714+
4715+#ifdef __cplusplus
4716+extern "C" {
4717+#endif
4718+#pragma GCC push_options
4719+#pragma GCC target("avx512f")
4720+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4721+#define BLOCK_NUM 16
4722+#define MS_SIMD_AVX512
4723+
// Adds a bias vector to one row: output[i] = input[i] + bias[i] for the inner
// dimension of length `num`; returns the first unprocessed index.
static inline int BiasAddByInnerCoreAVX512(int index, const float *input, const float *bias, float *output,
                                                       int64_t num) {
  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
    SIMD_ST_F32(output + index, vout);
  }
  return index;
}

// Adds the same bias vector to four batch rows at once.
// NOTE(review): SIMD_LDX4_F32 presumably declares input_data1..input_data4,
// loading four rows of `input` strided by `num` — confirm against the macro
// definition in the nnacl SIMD instruction headers.
static inline int BiasAddByBatchCoreAVX512(int index, const float *input, const float *bias, float *output1,
                                                       float *output2, float *output3, float *output4, int64_t num) {
  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_LDX4_F32(input_data, input + index, num);
    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
  }
  return index;
}
4747+
4748+#undef MS_SIMD_INSTRUCTION
4749+#undef BLOCK_NUM
4750+#pragma GCC pop_options
4751+#undef MS_SIMD_AVX512
4752+#ifdef __cplusplus
4753+};
4754+#endif
4755+
4756+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_SIMD_H_
4757diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
4758new file mode 100644
4759index 00000000..91d52718
4760--- /dev/null
4761+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
4762@@ -0,0 +1,56 @@
4763+/**
4764+ * Copyright 2022 Huawei Technologies Co., Ltd
4765+ *
4766+ * Licensed under the Apache License, Version 2.0 (the "License");
4767+ * you may not use this file except in compliance with the License.
4768+ * You may obtain a copy of the License at
4769+ *
4770+ * http://www.apache.org/licenses/LICENSE-2.0
4771+ *
4772+ * Unless required by applicable law or agreed to in writing, software
4773+ * distributed under the License is distributed on an "AS IS" BASIS,
4774+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4775+ * See the License for the specific language governing permissions and
4776+ * limitations under the License.
4777+ */
4778+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_
4779+#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_
4780+
4781+#include "nnacl/intrinsics/ms_simd_instructions.h"
4782+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4783+
4784+#ifdef __cplusplus
4785+extern "C" {
4786+#endif
4787+#pragma GCC push_options
4788+#pragma GCC target("avx512f")
4789+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4790+#define BLOCK_NUM 16
4791+#define MS_SIMD_AVX512
4792+
4793+static inline int Int32ToFloat32AVX512(int index, const int32_t *input, float *output, int number) {
4794+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4795+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
4796+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
4797+  }
4798+  return index;
4799+}
4800+
4801+#ifndef MS_SIMD_NEON
4802+static inline int Float32ToInt32AVX512(int index, const float *input, int32_t *output, int number) {
4803+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4804+    SIMD_F32 value = SIMD_LD_F32(input + index);
4805+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
4806+  }
4807+  return index;
4808+}
4809+#endif
4810+
4811+#undef MS_SIMD_INSTRUCTION
4812+#undef BLOCK_NUM
4813+#pragma GCC pop_options
4814+#undef MS_SIMD_AVX512
4815+#ifdef __cplusplus
4816+}
4817+#endif
4818+#endif
4819diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
4820new file mode 100644
4821index 00000000..11a2abcf
4822--- /dev/null
4823+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
4824@@ -0,0 +1,70 @@
4825+/**
4826+ * Copyright 2022 Huawei Technologies Co., Ltd
4827+ *
4828+ * Licensed under the Apache License, Version 2.0 (the "License");
4829+ * you may not use this file except in compliance with the License.
4830+ * You may obtain a copy of the License at
4831+ *
4832+ * http://www.apache.org/licenses/LICENSE-2.0
4833+ *
4834+ * Unless required by applicable law or agreed to in writing, software
4835+ * distributed under the License is distributed on an "AS IS" BASIS,
4836+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4837+ * See the License for the specific language governing permissions and
4838+ * limitations under the License.
4839+ */
4840+#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX512_H_
4841+#define MINDSPORE_NNACL_FP32_CDIST_AVX512_H_
4842+
4843+#include "nnacl/intrinsics/ms_simd_instructions.h"
4844+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4845+
4846+#ifdef __cplusplus
4847+extern "C" {
4848+#endif
4849+#pragma GCC push_options
4850+#pragma GCC target("avx512f")
4851+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4852+#define BLOCK_NUM 16
4853+#define MS_SIMD_AVX512
4854+
4855+static inline int64_t CdistTwoNormalOptAVX512(int64_t index, const float *a, const float *b,
4856+                                                          float *out, int64_t size) {
4857+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
4858+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4859+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
4860+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
4861+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
4862+    tmp_vec = SIMD_ABS_F32(tmp_vec);
4863+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
4864+  }
4865+  *out += SIMD_GET_SUM_F32(result_vec);
4866+
4867+  return index;
4868+}
4869+
4870+static inline int64_t CdistPNormalOptAVX512(int64_t index, const float *a, const float *b,
4871+                                                        float *out, int64_t size, float p) {
4872+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
4873+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
4874+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4875+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
4876+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
4877+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
4878+    tmp_vec = SIMD_ABS_F32(tmp_vec);
4879+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
4880+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
4881+  }
4882+  *out += SIMD_GET_SUM_F32(result_vec);
4883+
4884+  return index;
4885+}
4886+
4887+#undef MS_SIMD_INSTRUCTION
4888+#undef BLOCK_NUM
4889+#pragma GCC pop_options
4890+#undef MS_SIMD_AVX512
4891+#ifdef __cplusplus
4892+}
4893+#endif
4894+#endif
4895diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
4896new file mode 100644
4897index 00000000..f82adabf
4898--- /dev/null
4899+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
4900@@ -0,0 +1,121 @@
4901+/**
4902+ * Copyright 2022 Huawei Technologies Co., Ltd
4903+ *
4904+ * Licensed under the Apache License, Version 2.0 (the "License");
4905+ * you may not use this file except in compliance with the License.
4906+ * You may obtain a copy of the License at
4907+ *
4908+ * http://www.apache.org/licenses/LICENSE-2.0
4909+ *
4910+ * Unless required by applicable law or agreed to in writing, software
4911+ * distributed under the License is distributed on an "AS IS" BASIS,
4912+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
4913+ * See the License for the specific language governing permissions and
4914+ * limitations under the License.
4915+ */
4916+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_
4917+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_
4918+
4919+#include "nnacl/intrinsics/ms_simd_instructions.h"
4920+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
4921+
4922+#ifdef __cplusplus
4923+extern "C" {
4924+#endif
4925+#pragma GCC push_options
4926+#pragma GCC target("avx512f")
4927+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
4928+#define BLOCK_NUM 16
4929+#define MS_SIMD_AVX512
4930+
4931+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
4932+// (a, b, c) -> (0, a,   a+b)    exclusive == true
4933+static inline int64_t CumsumOutputInitWithInputAVX512(int64_t index, const float *layer_input,
4934+  float *layer_output, int inner_dim) {
4935+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4936+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
4937+  }
4938+  return index;
4939+}
4940+
4941+static inline int64_t CumsumOutputInitWithZeroAVX512(int64_t index, float *layer_output, int inner_dim) {
4942+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4943+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
4944+  }
4945+  return index;
4946+}
4947+
4948+static inline int64_t CumsumAVX512(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
4949+  int inner_dim) {
4950+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4951+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
4952+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
4953+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
4954+    SIMD_ST_F32(layer_output + index, out_val);
4955+  }
4956+  return index;
4957+}
4958+
4959+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
4960+// (a, b, c) -> (c+b, c, 0) exclusive==true
4961+static inline int64_t CumsumReverseAVX512(int64_t index, const float *layer_input, float *layer_output,
4962+  float *layer_last_output, int inner_dim) {
4963+
4964+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4965+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
4966+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
4967+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
4968+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
4969+  }
4970+  return index;
4971+}
4972+
4973+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
4974+// (a, b, c) -> (0, a,   a+b)    exclusive == true
4975+static inline int64_t CumsumIntOutputInitWithInputAVX512(int64_t index, const int *layer_input,
4976+  int *layer_output, int inner_dim) {
4977+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4978+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
4979+  }
4980+  return index;
4981+}
4982+
4983+static inline int64_t CumsumIntOutputInitWithZeroAVX512(int64_t index, int *layer_output, int inner_dim) {
4984+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4985+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
4986+  }
4987+  return index;
4988+}
4989+
4990+static inline int64_t CumsumIntAVX512(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
4991+  int inner_dim) {
4992+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
4993+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
4994+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
4995+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
4996+    SIMD_ST_EPI32(layer_output + index, out_val);
4997+  }
4998+  return index;
4999+}
5000+
5001+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
5002+// (a, b, c) -> (c+b, c, 0) exclusive==true
5003+static inline int64_t CumsumReverseIntAVX512(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
5004+  int inner_dim) {
5005+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5006+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
5007+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
5008+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
5009+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
5010+  }
5011+  return index;
5012+}
5013+
5014+#undef MS_SIMD_INSTRUCTION
5015+#undef BLOCK_NUM
5016+#pragma GCC pop_options
5017+#undef MS_SIMD_AVX512
5018+#ifdef __cplusplus
5019+}
5020+#endif
5021+#endif
5022diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
5023new file mode 100644
5024index 00000000..4de588fb
5025--- /dev/null
5026+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
5027@@ -0,0 +1,167 @@
5028+/**
5029+ * Copyright 2022 Huawei Technologies Co., Ltd
5030+ *
5031+ * Licensed under the Apache License, Version 2.0 (the "License");
5032+ * you may not use this file except in compliance with the License.
5033+ * You may obtain a copy of the License at
5034+ *
5035+ * http://www.apache.org/licenses/LICENSE-2.0
5036+ *
5037+ * Unless required by applicable law or agreed to in writing, software
5038+ * distributed under the License is distributed on an "AS IS" BASIS,
5039+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5040+ * See the License for the specific language governing permissions and
5041+ * limitations under the License.
5042+ */
5043+
5044+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX512_H_
5045+#define MINDSPORE_NNACL_FP32_DIV_AVX512_H_
5046+
5047+#include "nnacl/intrinsics/ms_simd_instructions.h"
5048+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5049+
5050+#ifdef __cplusplus
5051+extern "C" {
5052+#endif
5053+#pragma GCC push_options
5054+#pragma GCC target("avx512f")
5055+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5056+#define BLOCK_NUM 16
5057+#define MS_SIMD_AVX512
5058+
5059+static inline int ElementOptDivNum0AVX512(int index, const float *in0, const float *in1, float *out,
5060+                                                      int size) {
5061+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
5062+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5063+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5064+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
5065+    SIMD_ST_F32(out + index, vout);
5066+  }
5067+  return index;
5068+}
5069+
5070+static inline int ElementOptDivNum1AVX512(int index, const float *in0, const float *in1, float *out,
5071+                                                      int size) {
5072+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5073+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5074+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5075+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
5076+    SIMD_ST_F32(out + index, vout);
5077+  }
5078+  return index;
5079+}
5080+
5081+static inline int ElementOptDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5082+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
5083+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5084+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5085+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
5086+    SIMD_ST_EPI32(out + index, vout);
5087+  }
5088+  return index;
5089+}
5090+
5091+static inline int ElementOptDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5092+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5093+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5094+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5095+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
5096+    SIMD_ST_EPI32(out + index, vout);
5097+  }
5098+  return index;
5099+}
5100+
5101+static inline int ElementOptDivReluNum0AVX512(int index, const float *in0, const float *in1, float *out,
5102+                                                          int size) {
5103+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
5104+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5105+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5106+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
5107+    SIMD_ST_F32(out + index, vout);
5108+  }
5109+  return index;
5110+}
5111+
5112+static inline int ElementOptDivReluNum1AVX512(int index, const float *in0, const float *in1, float *out,
5113+                                                          int size) {
5114+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5115+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5116+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5117+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
5118+    SIMD_ST_F32(out + index, vout);
5119+  }
5120+  return index;
5121+}
5122+
5123+static inline int ElementOptDivRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out,
5124+                                                           int size) {
5125+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
5126+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5127+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5128+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
5129+    SIMD_ST_F32(out + index, vout);
5130+  }
5131+  return index;
5132+}
5133+
5134+static inline int ElementOptDivRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out,
5135+                                                           int size) {
5136+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5137+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5138+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5139+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
5140+    SIMD_ST_F32(out + index, vout);
5141+  }
5142+  return index;
5143+}
5144+
5145+static inline int ElementDivAVX512(int index, const float *in0, const float *in1, float *out, int size) {
5146+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5147+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5148+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5149+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
5150+    SIMD_ST_F32(out + index, vout);
5151+  }
5152+  return index;
5153+}
5154+
5155+static inline int ElementDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5156+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5157+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5158+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5159+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
5160+    SIMD_ST_EPI32(out + index, vout);
5161+  }
5162+  return index;
5163+}
5164+
5165+static inline int ElementDivReluAVX512(int index, const float *in0, const float *in1, float *out,
5166+                                                   int size) {
5167+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5168+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5169+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5170+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
5171+    SIMD_ST_F32(out + index, vout);
5172+  }
5173+  return index;
5174+}
5175+
5176+static inline int ElementDivRelu6AVX512(int index, const float *in0, const float *in1, float *out,
5177+                                                    int size) {
5178+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5179+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5180+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5181+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
5182+    SIMD_ST_F32(out + index, vout);
5183+  }
5184+  return index;
5185+}
5186+
5187+#undef MS_SIMD_INSTRUCTION
5188+#undef BLOCK_NUM
5189+#pragma GCC pop_options
5190+#undef MS_SIMD_AVX512
5191+#ifdef __cplusplus
5192+}
5193+#endif
5194+#endif
5195diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
5196new file mode 100644
5197index 00000000..eb847c23
5198--- /dev/null
5199+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
5200@@ -0,0 +1,46 @@
5201+/**
5202+ * Copyright 2022 Huawei Technologies Co., Ltd
5203+ *
5204+ * Licensed under the Apache License, Version 2.0 (the "License");
5205+ * you may not use this file except in compliance with the License.
5206+ * You may obtain a copy of the License at
5207+ *
5208+ * http://www.apache.org/licenses/LICENSE-2.0
5209+ *
5210+ * Unless required by applicable law or agreed to in writing, software
5211+ * distributed under the License is distributed on an "AS IS" BASIS,
5212+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5213+ * See the License for the specific language governing permissions and
5214+ * limitations under the License.
5215+ */
5216+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_
5217+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_
5218+
5219+#include "nnacl/intrinsics/ms_simd_instructions.h"
5220+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5221+
5222+#ifdef __cplusplus
5223+extern "C" {
5224+#endif
5225+#pragma GCC push_options
5226+#pragma GCC target("avx512f")
5227+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5228+#define BLOCK_NUM 16
5229+#define MS_SIMD_AVX512
5230+
5231+static inline int DropoutFp32AVX512(int index, const float *input, float scale,
5232+    int length, float *output) {
5233+    SIMD_F32 scale_value = SIMD_MOV_F32(scale);
5234+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5235+        SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
5236+    }
5237+    return index;
5238+}
5239+#undef MS_SIMD_INSTRUCTION
5240+#undef BLOCK_NUM
5241+#pragma GCC pop_options
5242+#undef MS_SIMD_AVX512
5243+#ifdef __cplusplus
5244+}
5245+#endif
5246+#endif
5247diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
5248new file mode 100644
5249index 00000000..14386f5f
5250--- /dev/null
5251+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
5252@@ -0,0 +1,63 @@
5253+/**
5254+ * Copyright 2022 Huawei Technologies Co., Ltd
5255+ *
5256+ * Licensed under the Apache License, Version 2.0 (the "License");
5257+ * you may not use this file except in compliance with the License.
5258+ * You may obtain a copy of the License at
5259+ *
5260+ * http://www.apache.org/licenses/LICENSE-2.0
5261+ *
5262+ * Unless required by applicable law or agreed to in writing, software
5263+ * distributed under the License is distributed on an "AS IS" BASIS,
5264+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5265+ * See the License for the specific language governing permissions and
5266+ * limitations under the License.
5267+ */
5268+
5269+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX512_H_
5270+#define MINDSPORE_NNACL_FP32_EXP_AVX512_H_
5271+
5272+#include "nnacl/intrinsics/ms_simd_instructions.h"
5273+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5274+
5275+#ifdef __cplusplus
5276+extern "C" {
5277+#endif
5278+#pragma GCC push_options
5279+#pragma GCC target("avx512f")
5280+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5281+#define BLOCK_NUM 16
5282+#define MS_SIMD_AVX512
5283+
5284+static inline int64_t ExpFp32AVX512(int64_t index, const float *src, float *dst, int num) {
5285+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5286+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
5287+  }
5288+  return index;
5289+}
5290+
5291+static inline int64_t ExpFp32WithInScaleAVX512(int64_t index, const float *src, float *dst, int num, float in_scale) {
5292+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
5293+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5294+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
5295+  }
5296+  return index;
5297+}
5298+
5299+static inline int64_t ExpFp32WithOutScaleAVX512(int64_t index, const float *src, float *dst, int num, float out_scale) {
5300+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
5301+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5302+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
5303+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
5304+  }
5305+  return index;
5306+}
5307+
5308+#undef MS_SIMD_INSTRUCTION
5309+#undef BLOCK_NUM
5310+#pragma GCC pop_options
5311+#undef MS_SIMD_AVX512
5312+#ifdef __cplusplus
5313+}
5314+#endif
5315+#endif
5316diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
5317new file mode 100644
5318index 00000000..5eb04746
5319--- /dev/null
5320+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
5321@@ -0,0 +1,53 @@
5322+/**
5323+ * Copyright 2022 Huawei Technologies Co., Ltd
5324+ *
5325+ * Licensed under the Apache License, Version 2.0 (the "License");
5326+ * you may not use this file except in compliance with the License.
5327+ * You may obtain a copy of the License at
5328+ *
5329+ * http://www.apache.org/licenses/LICENSE-2.0
5330+ *
5331+ * Unless required by applicable law or agreed to in writing, software
5332+ * distributed under the License is distributed on an "AS IS" BASIS,
5333+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5334+ * See the License for the specific language governing permissions and
5335+ * limitations under the License.
5336+ */
5337+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_
5338+#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_
5339+
5340+#include "nnacl/intrinsics/ms_simd_instructions.h"
5341+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5342+
5343+#ifdef __cplusplus
5344+extern "C" {
5345+#endif
5346+#pragma GCC push_options
5347+#pragma GCC target("avx512f")
5348+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5349+#define BLOCK_NUM 16
5350+#define MS_SIMD_AVX512
5351+
5352+static inline int FillFp32AVX512(int index, float *output, int size, float data) {
5353+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5354+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
5355+  }
5356+  return index;
5357+}
5358+
5359+static inline int FillInt32AVX512(int index, int *output, int size, int data) {
5360+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5361+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
5362+  }
5363+  return index;
5364+}
5365+
5366+#undef MS_SIMD_INSTRUCTION
5367+#undef BLOCK_NUM
5368+#pragma GCC pop_options
5369+#undef MS_SIMD_AVX512
5370+#ifdef __cplusplus
5371+}
5372+#endif
5373+#endif
5374+
5375diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
5376new file mode 100644
5377index 00000000..f26537d9
5378--- /dev/null
5379+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
5380@@ -0,0 +1,77 @@
5381+/**
5382+ * Copyright 2022 Huawei Technologies Co., Ltd
5383+ *
5384+ * Licensed under the Apache License, Version 2.0 (the "License");
5385+ * you may not use this file except in compliance with the License.
5386+ * You may obtain a copy of the License at
5387+ *
5388+ * http://www.apache.org/licenses/LICENSE-2.0
5389+ *
5390+ * Unless required by applicable law or agreed to in writing, software
5391+ * distributed under the License is distributed on an "AS IS" BASIS,
5392+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5393+ * See the License for the specific language governing permissions and
5394+ * limitations under the License.
5395+ */
5396+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_
5397+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_
5398+
5399+#include "nnacl/intrinsics/ms_simd_instructions.h"
5400+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5401+
5402+#ifdef __cplusplus
5403+extern "C" {
5404+#endif
5405+#pragma GCC push_options
5406+#pragma GCC target("avx512f")
5407+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5408+#define BLOCK_NUM 16
5409+#define MS_SIMD_AVX512
5410+
5411+static inline int64_t GroupNormFp32AVX512(int64_t index, const float *unit_input, float scale, float offset, float mean,
5412+  float var_sqrt, int unit, float *unit_output) {
5413+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
5414+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
5415+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
5416+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
5417+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5418+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
5419+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
5420+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
5421+    SIMD_ST_F32(unit_output + index, output);
5422+  }
5423+  return index;
5424+}
5425+
5426+static inline int64_t GroupNormReduceSumAVX512(int64_t index, const float *in, float *sum, int unit) {
5427+  if (unit - index >= 4 * BLOCK_NUM) {
5428+    SIMD_F32 tmp = SIMD_MOV_F32(0);
5429+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5430+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
5431+    }
5432+    *sum += SIMD_GET_SUM_F32(tmp);
5433+  }
5434+  return index;
5435+}
5436+
5437+static inline int64_t GroupNormReduceVarAVX512(int64_t index, const float *in, float mean, float *sum, int unit) {
5438+  if (unit - index >= 4 * BLOCK_NUM) {
5439+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
5440+    SIMD_F32 tmp = SIMD_MOV_F32(0);
5441+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5442+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
5443+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
5444+    }
5445+    *sum += SIMD_GET_SUM_F32(tmp);
5446+  }
5447+  return index;
5448+}
5449+
5450+#undef MS_SIMD_INSTRUCTION
5451+#undef BLOCK_NUM
5452+#pragma GCC pop_options
5453+#undef MS_SIMD_AVX512
5454+#ifdef __cplusplus
5455+}
5456+#endif
5457+#endif
5458diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
5459new file mode 100644
5460index 00000000..e5fb6d7b
5461--- /dev/null
5462+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
5463@@ -0,0 +1,68 @@
5464+/**
5465+ * Copyright 2022 Huawei Technologies Co., Ltd
5466+ *
5467+ * Licensed under the Apache License, Version 2.0 (the "License");
5468+ * you may not use this file except in compliance with the License.
5469+ * You may obtain a copy of the License at
5470+ *
5471+ * http://www.apache.org/licenses/LICENSE-2.0
5472+ *
5473+ * Unless required by applicable law or agreed to in writing, software
5474+ * distributed under the License is distributed on an "AS IS" BASIS,
5475+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5476+ * See the License for the specific language governing permissions and
5477+ * limitations under the License.
5478+ */
5479+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_
5480+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_
5481+
5482+#include "nnacl/intrinsics/ms_simd_instructions.h"
5483+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5484+
5485+#ifdef __cplusplus
5486+extern "C" {
5487+#endif
5488+#pragma GCC push_options
5489+#pragma GCC target("avx512f")
5490+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5491+#define BLOCK_NUM 16
5492+#define MS_SIMD_AVX512
5493+
5494+static inline int LayerNormMeanAndSquareAVX512(int index, const float *src, int num, float *mean, float *square_mean) {
5495+  if (num >= 4 * BLOCK_NUM) {
5496+    SIMD_F32 sum_val = SIMD_SET0_F32;
5497+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
5498+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5499+      SIMD_F32 value = SIMD_LD_F32(src + index);
5500+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
5501+      sum_val = SIMD_ADD_F32(sum_val, value);
5502+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
5503+    }
5504+    *mean += SIMD_GET_SUM_F32(sum_val);
5505+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
5506+  }
5507+  return index;
5508+}
5509+
5510+static inline int LayerNormGammaAndBetaAVX512(int index, float *dst, const float *src, const float *gamma_data,
5511+  const float *beta_data, int num, const float mean, const float deno) {
5512+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
5513+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
5514+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5515+    SIMD_F32 value = SIMD_LD_F32(src + index);
5516+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
5517+    out_value = SIMD_MUL_F32(out_value, deno_val);
5518+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
5519+    SIMD_ST_F32(dst + index, out_value);
5520+  }
5521+  return index;
5522+}
5523+
5524+#undef MS_SIMD_INSTRUCTION
5525+#undef BLOCK_NUM
5526+#pragma GCC pop_options
5527+#undef MS_SIMD_AVX512
5528+#ifdef __cplusplus
5529+}
5530+#endif
5531+#endif
5532diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
5533new file mode 100644
5534index 00000000..d51779d4
5535--- /dev/null
5536+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
5537@@ -0,0 +1,93 @@
5538+/**
5539+ * Copyright 2022 Huawei Technologies Co., Ltd
5540+ *
5541+ * Licensed under the Apache License, Version 2.0 (the "License");
5542+ * you may not use this file except in compliance with the License.
5543+ * You may obtain a copy of the License at
5544+ *
5545+ * http://www.apache.org/licenses/LICENSE-2.0
5546+ *
5547+ * Unless required by applicable law or agreed to in writing, software
5548+ * distributed under the License is distributed on an "AS IS" BASIS,
5549+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5550+ * See the License for the specific language governing permissions and
5551+ * limitations under the License.
5552+ */
5553+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_
5554+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_
5555+
5556+#include "nnacl/intrinsics/ms_simd_instructions.h"
5557+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5558+
5559+#ifdef __cplusplus
5560+extern "C" {
5561+#endif
5562+#pragma GCC push_options
5563+#pragma GCC target("avx512f")
5564+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5565+#define BLOCK_NUM 16
5566+#define MS_SIMD_AVX512
5567+
5568+// act_type must be 0, 1 or 3. 0: no_act, 1: relu, 3: relu6.
5569+static inline int64_t GemmIsNotPackAVX512(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
5570+  int deep, int act_type) {
5571+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
5572+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
5573+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
5574+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);  // NOTE(review): bias dereferenced unconditionally, but MatVecMulNoPackCoreAVX512 below guards bias == NULL — confirm callers never pass NULL here
5575+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5576+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
5577+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
5578+    if (act_type != 0) {
5579+      dst = SIMD_MAX_F32(dst, down_threshold);
5580+      if (act_type == 3) {
5581+        dst = SIMD_MIN_F32(dst, up_threshold);
5582+      }
5583+    }
5584+    SIMD_ST_F32(c + index, dst);
5585+  }
5586+
5587+  return index;
5588+}
5589+
5590+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)  // NOTE(review): always true — MS_SIMD_AVX512 is defined unconditionally above; guard is redundant
5591+static inline int64_t GemmIsNotPackOptimizeCoreAVX512(int64_t index, const float *a, const float *b, int k, float *dst) {
5592+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
5593+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5594+    SIMD_F32 weight = SIMD_LD_F32(b + index);
5595+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
5596+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
5597+  }
5598+  *dst += SIMD_REDUCE_ADD_F32(dst1);
5599+  return index;
5600+}
5601+#endif
5602+
5603+static inline int64_t MatVecMulNoPackCoreAVX512(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
5604+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
5605+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
5606+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
5607+    for (int64_t k = 0; k < depth; ++k) {
5608+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
5609+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
5610+      out = SIMD_FMADD_F32(left, right, out);
5611+    }
5612+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
5613+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
5614+      if (act_type == 0x3) {
5615+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
5616+      }
5617+    }
5618+    SIMD_ST_F32(c + oc_index, out);
5619+  }
5620+  return oc_index;
5621+}
5622+
5623+#undef MS_SIMD_INSTRUCTION
5624+#undef BLOCK_NUM
5625+#pragma GCC pop_options
5626+#undef MS_SIMD_AVX512
5627+#ifdef __cplusplus
5628+}
5629+#endif
5630+#endif
5631diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
5632new file mode 100644
5633index 00000000..e3b242e4
5634--- /dev/null
5635+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
5636@@ -0,0 +1,218 @@
5637+/**
5638+ * Copyright 2022 Huawei Technologies Co., Ltd
5639+ *
5640+ * Licensed under the Apache License, Version 2.0 (the "License");
5641+ * you may not use this file except in compliance with the License.
5642+ * You may obtain a copy of the License at
5643+ *
5644+ * http://www.apache.org/licenses/LICENSE-2.0
5645+ *
5646+ * Unless required by applicable law or agreed to in writing, software
5647+ * distributed under the License is distributed on an "AS IS" BASIS,
5648+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5649+ * See the License for the specific language governing permissions and
5650+ * limitations under the License.
5651+ */
5652+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX512_H_
5653+#define MINDSPORE_NNACL_FP32_MUL_AVX512_H_
5654+
5655+#include "nnacl/intrinsics/ms_simd_instructions.h"
5656+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5657+
5658+#ifdef __cplusplus
5659+extern "C" {
5660+#endif
5661+#pragma GCC push_options
5662+#pragma GCC target("avx512f")
5663+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5664+#define BLOCK_NUM 16
5665+#define MS_SIMD_AVX512
5666+
5667+static inline int ElementMulAVX512(int index, const float *in0, const float *in1, float *out, int size) {
5668+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5669+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5670+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5671+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
5672+    SIMD_ST_F32(out + index, vout);
5673+  }
5674+  return index;
5675+}
5676+
5677+static inline int ElementMulReluAVX512(int index, const float *in0, const float *in1, float *out, int size) {
5678+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5679+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5680+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5681+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
5682+    SIMD_ST_F32(out + index, vout);
5683+  }
5684+  return index;
5685+}
5686+
5687+static inline int ElementMulRelu6AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5688+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5689+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5690+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5691+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
5692+    SIMD_ST_F32(out + index, vout);
5693+  }
5694+  return index;
5695+}
5696+
5697+static inline int ElementMulIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5698+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5699+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5700+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5701+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
5702+    SIMD_ST_EPI32(out + index, vout);
5703+  }
5704+  return index;
5705+}
5706+
5707+static inline int ElementMulReluIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5708+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5709+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5710+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5711+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);  // NOTE(review): float literal 0.0f passed to integer max; integer 0 is likely intended
5712+    SIMD_ST_EPI32(out + index, vout);
5713+  }
5714+  return index;
5715+}
5716+
5717+static inline int ElementMulRelu6IntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
5718+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5719+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5720+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5721+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);  // NOTE(review): integer clamp written with float literals 0.0f/6.0f; integer 0 and 6 are likely intended
5722+    SIMD_ST_EPI32(out + index, vout);
5723+  }
5724+  return index;
5725+}
5726+
5727+static inline int ElementOptMulNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5728+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
5729+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5730+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5731+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
5732+    SIMD_ST_F32(out + index, vout);
5733+  }
5734+  return index;
5735+}
5736+
5737+static inline int ElementOptMulNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5738+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5739+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5740+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5741+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
5742+    SIMD_ST_F32(out + index, vout);
5743+  }
5744+  return index;
5745+}
5746+
5747+static inline int ElementOptMulReluNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5748+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
5749+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5750+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5751+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
5752+    SIMD_ST_F32(out + index, vout);
5753+  }
5754+  return index;
5755+}
5756+
5757+static inline int ElementOptMulReluNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5758+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5759+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5760+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5761+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
5762+    SIMD_ST_F32(out + index, vout);
5763+  }
5764+  return index;
5765+}
5766+
5767+static inline int ElementOptMulRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5768+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
5769+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5770+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
5771+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
5772+    SIMD_ST_F32(out + index, vout);
5773+  }
5774+  return index;
5775+}
5776+
5777+static inline int ElementOptMulRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
5778+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
5779+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5780+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
5781+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
5782+    SIMD_ST_F32(out + index, vout);
5783+  }
5784+  return index;
5785+}
5786+
5787+static inline int ElementOptMulIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5788+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
5789+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5790+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5791+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
5792+    SIMD_ST_EPI32(out + index, vout);
5793+  }
5794+  return index;
5795+}
5796+
5797+static inline int ElementOptMulIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5798+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5799+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5800+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5801+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
5802+    SIMD_ST_EPI32(out + index, vout);
5803+  }
5804+  return index;
5805+}
5806+
5807+static inline int ElementOptMulReluIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5808+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
5809+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5810+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5811+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
5812+    SIMD_ST_EPI32(out + index, vout);
5813+  }
5814+  return index;
5815+}
5816+
5817+static inline int ElementOptMulReluIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5818+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5819+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5820+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5821+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
5822+    SIMD_ST_EPI32(out + index, vout);
5823+  }
5824+  return index;
5825+}
5826+
5827+static inline int ElementOptMulRelu6IntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5828+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
5829+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5830+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
5831+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
5832+    SIMD_ST_EPI32(out + index, vout);
5833+  }
5834+  return index;
5835+}
5836+
5837+static inline int ElementOptMulRelu6IntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
5838+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
5839+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5840+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
5841+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
5842+    SIMD_ST_EPI32(out + index, vout);
5843+  }
5844+  return index;
5845+}
5846+
5847+#undef MS_SIMD_INSTRUCTION
5848+#undef BLOCK_NUM
5849+#pragma GCC pop_options
5850+#undef MS_SIMD_AVX512
5851+#ifdef __cplusplus
5852+}
5853+#endif
5854+#endif
5855diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
5856new file mode 100644
5857index 00000000..d1e001ee
5858--- /dev/null
5859+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
5860@@ -0,0 +1,84 @@
5861+/**
5862+ * Copyright 2022 Huawei Technologies Co., Ltd
5863+ *
5864+ * Licensed under the Apache License, Version 2.0 (the "License");
5865+ * you may not use this file except in compliance with the License.
5866+ * You may obtain a copy of the License at
5867+ *
5868+ * http://www.apache.org/licenses/LICENSE-2.0
5869+ *
5870+ * Unless required by applicable law or agreed to in writing, software
5871+ * distributed under the License is distributed on an "AS IS" BASIS,
5872+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5873+ * See the License for the specific language governing permissions and
5874+ * limitations under the License.
5875+ */
5876+#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX512_H_
5877+#define MINDSPORE_NNACL_FP32_POOLING_AVX512_H_
5878+
5879+#include "nnacl/intrinsics/ms_simd_instructions.h"
5880+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5881+
5882+#ifdef __cplusplus
5883+extern "C" {
5884+#endif
5885+#pragma GCC push_options
5886+#pragma GCC target("avx512f")
5887+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5888+#define BLOCK_NUM 16
5889+#define MS_SIMD_AVX512
5890+
5891+static inline int AvgPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel,
5892+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
5893+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
5894+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
5895+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
5896+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
5897+    const float *src_c_ptr = src_plane_ptr + ci;
5898+    float *dst_c_ptr = dst_plane_ptr + ci;
5899+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
5900+    int real_count = 0;
5901+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
5902+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
5903+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
5904+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
5905+        ++real_count;
5906+      }
5907+    }
5908+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
5909+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
5910+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
5911+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
5912+  }
5913+  return ci;
5914+}
5915+
5916+static inline int MaxPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel,
5917+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
5918+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
5919+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
5920+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
5921+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
5922+    const float *src_c_ptr = src_plane_ptr + ci;
5923+    float *dst_c_ptr = dst_plane_ptr + ci;
5924+    SIMD_F32 tmp_max = min_val;
5925+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
5926+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
5927+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
5928+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
5929+      }
5930+    }
5931+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
5932+    SIMD_ST_F32(dst_c_ptr, tmp_max);
5933+  }
5934+  return ci;
5935+}
5936+
5937+#undef MS_SIMD_INSTRUCTION
5938+#undef BLOCK_NUM
5939+#pragma GCC pop_options
5940+#undef MS_SIMD_AVX512
5941+#ifdef __cplusplus
5942+}
5943+#endif
5944+#endif
5945diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
5946new file mode 100644
5947index 00000000..a31eaf2f
5948--- /dev/null
5949+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
5950@@ -0,0 +1,101 @@
5951+/**
5952+ * Copyright 2022 Huawei Technologies Co., Ltd
5953+ *
5954+ * Licensed under the Apache License, Version 2.0 (the "License");
5955+ * you may not use this file except in compliance with the License.
5956+ * You may obtain a copy of the License at
5957+ *
5958+ * http://www.apache.org/licenses/LICENSE-2.0
5959+ *
5960+ * Unless required by applicable law or agreed to in writing, software
5961+ * distributed under the License is distributed on an "AS IS" BASIS,
5962+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
5963+ * See the License for the specific language governing permissions and
5964+ * limitations under the License.
5965+ */
5966+#ifndef MINDSPORE_NNACL_FP32_POWER_AVX512_H_
5967+#define MINDSPORE_NNACL_FP32_POWER_AVX512_H_
5968+
5969+#include "nnacl/intrinsics/ms_simd_instructions.h"
5970+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
5971+
5972+#ifdef __cplusplus
5973+extern "C" {
5974+#endif
5975+#pragma GCC push_options
5976+#pragma GCC target("avx512f")
5977+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
5978+#define BLOCK_NUM 16
5979+#define MS_SIMD_AVX512
5980+
5981+static inline int PowerBroadCastIntExponentAVX512(int index, const float *input, int exponent, float *output, int len,
5982+  float scale, float shift) {
5983+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
5984+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
5985+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
5986+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
5987+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
5988+    int exp = abs(exponent);
5989+    while (exp) {
5990+      if (exp % 2) {
5991+        result = SIMD_MUL_F32(result, tmp);
5992+      }
5993+      tmp = SIMD_MUL_SQUARE_F32(tmp);
5994+      exp = exp / 2;
5995+    }
5996+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
5997+  }
5998+  return index;
5999+}
6000+
6001+static inline int PowerBroadCastFloatExponentAVX512(int index, const float *input, float exponent, float *output, int len,
6002+  float scale, float shift) {
6003+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
6004+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
6005+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6006+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
6007+    SIMD_F32 result;
6008+    for (int i = 0; i < BLOCK_NUM; ++i) {
6009+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
6010+    }
6011+    SIMD_ST_F32(output + index, result);
6012+  }
6013+  return index;
6014+}
6015+
6016+static inline int PowerSingleExponentAVX512(int index, const float *input, const float *exponent, float *output, int len,
6017+  float scale, float shift) {
6018+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
6019+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
6020+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6021+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
6022+    for (int j = 0; j < BLOCK_NUM; ++j) {
6023+      float cur_exponent = exponent[index + j];
6024+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
6025+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
6026+        int exp = abs((int)(cur_exponent));
6027+        float result = 1;
6028+        while (exp) {
6029+          if (exp % 2) {
6030+            result *= cur_val;
6031+          }
6032+          cur_val *= cur_val;
6033+          exp = exp / 2;
6034+        }
6035+        output[index + j] = *exponent >= 0 ? result : 1 / result;  // NOTE(review): *exponent tests only exponent[0]; cur_exponent >= 0 looks intended since exponents are per-element
6036+      } else {
6037+        output[index + j] = powf(cur_val, cur_exponent);
6038+      }
6039+    }
6040+  }
6041+  return index;
6042+}
6043+
6044+#undef MS_SIMD_INSTRUCTION
6045+#undef BLOCK_NUM
6046+#pragma GCC pop_options
6047+#undef MS_SIMD_AVX512
6048+#ifdef __cplusplus
6049+}
6050+#endif
6051+#endif
6052diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
6053new file mode 100644
6054index 00000000..5885a044
6055--- /dev/null
6056+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
6057@@ -0,0 +1,181 @@
6058+/**
6059+ * Copyright 2022 Huawei Technologies Co., Ltd
6060+ *
6061+ * Licensed under the Apache License, Version 2.0 (the "License");
6062+ * you may not use this file except in compliance with the License.
6063+ * You may obtain a copy of the License at
6064+ *
6065+ * http://www.apache.org/licenses/LICENSE-2.0
6066+ *
6067+ * Unless required by applicable law or agreed to in writing, software
6068+ * distributed under the License is distributed on an "AS IS" BASIS,
6069+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6070+ * See the License for the specific language governing permissions and
6071+ * limitations under the License.
6072+ */
6073+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_
6074+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_
6075+
6076+#include "nnacl/intrinsics/ms_simd_instructions.h"
6077+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
6078+
6079+#ifdef __cplusplus
6080+extern "C" {
6081+#endif
6082+#pragma GCC push_options
6083+#pragma GCC target("avx512f")
6084+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
6085+#define BLOCK_NUM 16
6086+#define MS_SIMD_AVX512
6087+
6088+static inline int64_t ReduceSumAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6089+  int axis_size) {
6090+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6091+    const float *inner_src = outer_src + index;
6092+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6093+    for (int i = 0; i < axis_size; i++) {
6094+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6095+    }
6096+    SIMD_ST_F32(outer_dst + index, tmp);
6097+  }
6098+  return index;
6099+}
6100+
6101+static inline int64_t ReduceMeanAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6102+  int axis_size) {
6103+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6104+    const float *inner_src = outer_src + index;
6105+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6106+    for (int i = 0; i < axis_size; i++) {
6107+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6108+    }
6109+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
6110+  }
6111+  return index;
6112+}
6113+
6114+static inline int64_t ReduceMinAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6115+  int axis_size) {
6116+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6117+    const float *inner_src = outer_src + index;
6118+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
6119+    for (int i = 0; i < axis_size; i++) {
6120+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6121+    }
6122+    SIMD_ST_F32(outer_dst + index, tmp);
6123+  }
6124+  return index;
6125+}
6126+
6127+static inline int64_t ReduceMaxAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6128+  int axis_size) {
6129+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6130+    const float *inner_src = outer_src + index;
6131+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MIN);  // NOTE(review): FLT_MIN is the smallest POSITIVE float — wrong seed for a max-reduction over negative inputs; -FLT_MAX is likely intended (cf. ReduceMin using FLT_MAX)
6132+    for (int i = 0; i < axis_size; i++) {
6133+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6134+    }
6135+    SIMD_ST_F32(outer_dst + index, tmp);
6136+  }
6137+  return index;
6138+}
6139+
6140+static inline int64_t ReduceProdAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6141+  int axis_size) {
6142+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6143+    const float *inner_src = outer_src + index;
6144+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
6145+    for (int i = 0; i < axis_size; i++) {
6146+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
6147+    }
6148+    SIMD_ST_F32(outer_dst + index, tmp);
6149+  }
6150+  return index;
6151+}
6152+
6153+static inline int64_t ReduceSumSquareAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6154+  int axis_size) {
6155+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6156+    const float *inner_src = outer_src + index;
6157+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6158+    for (int i = 0; i < axis_size; i++) {
6159+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
6160+    }
6161+    SIMD_ST_F32(outer_dst + index, tmp);
6162+  }
6163+  return index;
6164+}
6165+
6166+static inline int64_t ReduceL2NormAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
6167+  int axis_size) {
6168+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6169+    const float *inner_src = outer_src + index;
6170+    SIMD_F32 tmp = SIMD_MOV_F32(0);
6171+    for (int i = 0; i < axis_size; i++) {
6172+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
6173+    }
6174+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));
6175+  }
6176+  return index;
6177+}
6178+
6179+static inline int64_t IntReduceSumAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6180+  int axis_size) {
6181+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6182+    const int *inner_src = outer_src + index;
6183+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
6184+    for (int i = 0; i < axis_size; i++) {
6185+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6186+    }
6187+    SIMD_ST_EPI32(outer_dst + index, tmp);
6188+  }
6189+  return index;
6190+}
6191+
6192+static inline int64_t IntReduceMeanAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6193+  int axis_size) {
6194+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6195+    const int *inner_src = outer_src + index;
6196+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
6197+    for (int i = 0; i < axis_size; i++) {
6198+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6199+    }
6200+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));
6201+  }
6202+  return index;
6203+}
6204+
6205+static inline int64_t IntReduceMinAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6206+  int axis_size) {
6207+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6208+    const int *inner_src = outer_src + index;
6209+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);
6210+    for (int i = 0; i < axis_size; i++) {
6211+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6212+    }
6213+    SIMD_ST_EPI32(outer_dst + index, tmp);
6214+  }
6215+  return index;
6216+}
6217+
6218+static inline int64_t IntReduceMaxAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
6219+  int axis_size) {
6220+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6221+    const int *inner_src = outer_src + index;
6222+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);
6223+    for (int i = 0; i < axis_size; i++) {
6224+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
6225+    }
6226+    SIMD_ST_EPI32(outer_dst + index, tmp);
6227+  }
6228+  return index;
6229+}
6230+
6231+#undef MS_SIMD_INSTRUCTION
6232+#undef BLOCK_NUM
6233+#pragma GCC pop_options
6234+#undef MS_SIMD_AVX512
6235+#ifdef __cplusplus
6236+}
6237+#endif
6238+#endif
6239diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
6240new file mode 100644
6241index 00000000..1fa1907e
6242--- /dev/null
6243+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
6244@@ -0,0 +1,87 @@
6245+/**
6246+ * Copyright 2022 Huawei Technologies Co., Ltd
6247+ *
6248+ * Licensed under the Apache License, Version 2.0 (the "License");
6249+ * you may not use this file except in compliance with the License.
6250+ * You may obtain a copy of the License at
6251+ *
6252+ * http://www.apache.org/licenses/LICENSE-2.0
6253+ *
6254+ * Unless required by applicable law or agreed to in writing, software
6255+ * distributed under the License is distributed on an "AS IS" BASIS,
6256+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6257+ * See the License for the specific language governing permissions and
6258+ * limitations under the License.
6259+ */
6260+
6261+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_
6262+#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_
6263+
6264+#include "nnacl/intrinsics/ms_simd_instructions.h"
6265+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
6266+
6267+#ifdef __cplusplus
6268+extern "C" {
6269+#endif
6270+#pragma GCC push_options
6271+#pragma GCC target("avx512f")
6272+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
6273+#define BLOCK_NUM 16
6274+#define MS_SIMD_AVX512
6275+
6276+static inline int64_t SoftmaxNormGetMaxAVX512(int64_t index, const float *src, int cur_batch_offset,
6277+  float *max, int channel) {
6278+  if (channel >= BLOCK_NUM * BLOCK_NUM) {
6279+    SIMD_F32 max_val = SIMD_MOV_F32(*max);
6280+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6281+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
6282+    }
6283+    *max = SIMD_GET_MAX_F32(max_val);
6284+  }
6285+  return index;
6286+}
6287+
6288+static inline int64_t SoftmaxNormCalcNormAVX512(int64_t index, const float *src, float *dst,
6289+  int cur_batch_offset, float max, int channel) {
6290+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6291+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
6292+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
6293+  }
6294+  return index;
6295+}
6296+
6297+static inline int64_t SoftmaxLastAxisGetExpSumAVX512(int64_t index, const float *src, float *dst,
6298+  int cur_batch_offset, float max, float *exp_sum, int channel) {
6299+#ifndef _WIN32
6300+  SIMD_F32 sum_val = SIMD_SET0_F32;
6301+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6302+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
6303+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
6304+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
6305+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
6306+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
6307+  }
6308+  *exp_sum += SIMD_GET_SUM_F32(sum_val);
6309+#endif
6310+  return index;
6311+}
6312+
6313+static inline int64_t SoftmaxLastAxisGetResultAVX512(int64_t index, const float *src, float *dst,
6314+  int cur_batch_offset, float exp_sum, int channel) {
6315+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
6316+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6317+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
6318+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
6319+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
6320+  }
6321+  return index;
6322+}
6323+
6324+#undef MS_SIMD_INSTRUCTION
6325+#undef BLOCK_NUM
6326+#pragma GCC pop_options
6327+#undef MS_SIMD_AVX512
6328+#ifdef __cplusplus
6329+};
6330+#endif
6331+#endif
6332diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
6333new file mode 100644
6334index 00000000..994fc7c0
6335--- /dev/null
6336+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
6337@@ -0,0 +1,167 @@
6338+/**
6339+ * Copyright 2022 Huawei Technologies Co., Ltd
6340+ *
6341+ * Licensed under the Apache License, Version 2.0 (the "License");
6342+ * you may not use this file except in compliance with the License.
6343+ * You may obtain a copy of the License at
6344+ *
6345+ * http://www.apache.org/licenses/LICENSE-2.0
6346+ *
6347+ * Unless required by applicable law or agreed to in writing, software
6348+ * distributed under the License is distributed on an "AS IS" BASIS,
6349+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6350+ * See the License for the specific language governing permissions and
6351+ * limitations under the License.
6352+ */
6353+
6354+#ifndef MINDSPORE_NNACL_FP32_SUB_AVX512_H_
6355+#define MINDSPORE_NNACL_FP32_SUB_AVX512_H_
6356+
6357+#include "nnacl/intrinsics/ms_simd_instructions.h"
6358+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
6359+
6360+#ifdef __cplusplus
6361+extern "C" {
6362+#endif
6363+#pragma GCC push_options
6364+#pragma GCC target("avx512f")
6365+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
6366+#define BLOCK_NUM 16
6367+#define MS_SIMD_AVX512
6368+
6369+static inline int ElementOptSubNum0AVX512(int index, const float *in0, const float *in1, float *out,
6370+                                                      int size) {
6371+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
6372+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6373+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6374+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
6375+    SIMD_ST_F32(out + index, vout);
6376+  }
6377+  return index;
6378+}
6379+
6380+static inline int ElementOptSubNum1AVX512(int index, const float *in0, const float *in1, float *out,
6381+                                                      int size) {
6382+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
6383+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6384+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6385+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
6386+    SIMD_ST_F32(out + index, vout);
6387+  }
6388+  return index;
6389+}
6390+
6391+static inline int ElementOptSubIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
6392+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
6393+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6394+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
6395+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
6396+    SIMD_ST_EPI32(out + index, vout);
6397+  }
6398+  return index;
6399+}
6400+
6401+static inline int ElementOptSubIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
6402+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
6403+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6404+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
6405+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
6406+    SIMD_ST_EPI32(out + index, vout);
6407+  }
6408+  return index;
6409+}
6410+
6411+static inline int ElementOptSubReluNum0AVX512(int index, const float *in0, const float *in1, float *out,
6412+                                                          int size) {
6413+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
6414+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6415+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6416+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
6417+    SIMD_ST_F32(out + index, vout);
6418+  }
6419+  return index;
6420+}
6421+
6422+static inline int ElementOptSubReluNum1AVX512(int index, const float *in0, const float *in1, float *out,
6423+                                                          int size) {
6424+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
6425+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6426+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6427+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
6428+    SIMD_ST_F32(out + index, vout);
6429+  }
6430+  return index;
6431+}
6432+
6433+static inline int ElementOptSubRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out,
6434+                                                           int size) {
6435+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
6436+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6437+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6438+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
6439+    SIMD_ST_F32(out + index, vout);
6440+  }
6441+  return index;
6442+}
6443+
6444+static inline int ElementOptSubRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out,
6445+                                                           int size) {
6446+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
6447+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6448+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6449+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
6450+    SIMD_ST_F32(out + index, vout);
6451+  }
6452+  return index;
6453+}
6454+
6455+static inline int ElementSubAVX512(int index, const float *in0, const float *in1, float *out, int size) {
6456+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6457+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6458+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6459+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
6460+    SIMD_ST_F32(out + index, vout);
6461+  }
6462+  return index;
6463+}
6464+
6465+static inline int ElementSubIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
6466+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6467+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
6468+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
6469+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
6470+    SIMD_ST_EPI32(out + index, vout);
6471+  }
6472+  return index;
6473+}
6474+
6475+static inline int ElementSubReluAVX512(int index, const float *in0, const float *in1, float *out,
6476+                                                   int size) {
6477+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6478+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6479+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6480+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
6481+    SIMD_ST_F32(out + index, vout);
6482+  }
6483+  return index;
6484+}
6485+
6486+static inline int ElementSubRelu6AVX512(int index, const float *in0, const float *in1, float *out,
6487+                                                    int size) {
6488+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
6489+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
6490+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
6491+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
6492+    SIMD_ST_F32(out + index, vout);
6493+  }
6494+  return index;
6495+}
6496+
6497+#undef MS_SIMD_INSTRUCTION
6498+#undef BLOCK_NUM
6499+#pragma GCC pop_options
6500+#undef MS_SIMD_AVX512
6501+#ifdef __cplusplus
6502+};
6503+#endif
6504+#endif
6505diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
6506new file mode 100644
6507index 00000000..88908c90
6508--- /dev/null
6509+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
6510@@ -0,0 +1,36 @@
6511+/**
6512+ * Copyright 2022 Huawei Technologies Co., Ltd
6513+ *
6514+ * Licensed under the Apache License, Version 2.0 (the "License");
6515+ * you may not use this file except in compliance with the License.
6516+ * You may obtain a copy of the License at
6517+ *
6518+ * http://www.apache.org/licenses/LICENSE-2.0
6519+ *
6520+ * Unless required by applicable law or agreed to in writing, software
6521+ * distributed under the License is distributed on an "AS IS" BASIS,
6522+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6523+ * See the License for the specific language governing permissions and
6524+ * limitations under the License.
6525+ */
6526+#ifndef MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_
6527+#define MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_
6528+
6529+#include "nnacl/intrinsics/ms_simd_instructions.h"
6530+#ifdef ENABLE_AVX512
6531+#include "nnacl/avx512/batchnorm_fp32_avx512.h"
6532+#endif
6533+
6534+#ifdef ENABLE_AVX
6535+#include "nnacl/avx/batchnorm_fp32_avx.h"
6536+#endif
6537+
6538+#ifdef ENABLE_SSE
6539+#include "nnacl/sse/batchnorm_fp32_sse.h"
6540+#endif
6541+
6542+#ifdef ENABLE_ARM
6543+#include "nnacl/neon/batchnorm_fp32_neon.h"
6544+#endif
6545+
6546+#endif
6547diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
6548new file mode 100644
6549index 00000000..f36981ab
6550--- /dev/null
6551+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
6552@@ -0,0 +1,36 @@
6553+/**
6554+ * Copyright 2022 Huawei Technologies Co., Ltd
6555+ *
6556+ * Licensed under the Apache License, Version 2.0 (the "License");
6557+ * you may not use this file except in compliance with the License.
6558+ * You may obtain a copy of the License at
6559+ *
6560+ * http://www.apache.org/licenses/LICENSE-2.0
6561+ *
6562+ * Unless required by applicable law or agreed to in writing, software
6563+ * distributed under the License is distributed on an "AS IS" BASIS,
6564+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6565+ * See the License for the specific language governing permissions and
6566+ * limitations under the License.
6567+ */
6568+#ifndef MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_
6569+#define MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_
6570+
6571+#include "nnacl/intrinsics/ms_simd_instructions.h"
6572+#ifdef ENABLE_AVX512
6573+#include "nnacl/avx512/bce_with_logits_loss_fp32_avx512.h"
6574+#endif
6575+
6576+#ifdef ENABLE_AVX
6577+#include "nnacl/avx/bce_with_logits_loss_fp32_avx.h"
6578+#endif
6579+
6580+#ifdef ENABLE_SSE
6581+#include "nnacl/sse/bce_with_logits_loss_fp32_sse.h"
6582+#endif
6583+
6584+#ifdef ENABLE_ARM
6585+#include "nnacl/neon/bce_with_logits_loss_fp32_neon.h"
6586+#endif
6587+
6588+#endif
6589diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
6590new file mode 100644
6591index 00000000..e765b1eb
6592--- /dev/null
6593+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
6594@@ -0,0 +1,36 @@
6595+/**
6596+ * Copyright 2022 Huawei Technologies Co., Ltd
6597+ *
6598+ * Licensed under the Apache License, Version 2.0 (the "License");
6599+ * you may not use this file except in compliance with the License.
6600+ * You may obtain a copy of the License at
6601+ *
6602+ * http://www.apache.org/licenses/LICENSE-2.0
6603+ *
6604+ * Unless required by applicable law or agreed to in writing, software
6605+ * distributed under the License is distributed on an "AS IS" BASIS,
6606+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6607+ * See the License for the specific language governing permissions and
6608+ * limitations under the License.
6609+ */
6610+#ifndef MINDSPORE_NNACL_BIAS_ADD_SIMD_H_
6611+#define MINDSPORE_NNACL_BIAS_ADD_SIMD_H_
6612+
6613+#include "nnacl/intrinsics/ms_simd_instructions.h"
6614+#ifdef ENABLE_AVX512
6615+#include "nnacl/avx512/bias_add_avx512.h"
6616+#endif
6617+
6618+#ifdef ENABLE_AVX
6619+#include "nnacl/avx/bias_add_avx.h"
6620+#endif
6621+
6622+#ifdef ENABLE_SSE
6623+#include "nnacl/sse/bias_add_sse.h"
6624+#endif
6625+
6626+#ifdef ENABLE_ARM
6627+#include "nnacl/neon/bias_add_neon.h"
6628+#endif
6629+
6630+#endif
6631diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
6632new file mode 100644
6633index 00000000..93d8ca33
6634--- /dev/null
6635+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
6636@@ -0,0 +1,36 @@
6637+/**
6638+ * Copyright 2022 Huawei Technologies Co., Ltd
6639+ *
6640+ * Licensed under the Apache License, Version 2.0 (the "License");
6641+ * you may not use this file except in compliance with the License.
6642+ * You may obtain a copy of the License at
6643+ *
6644+ * http://www.apache.org/licenses/LICENSE-2.0
6645+ *
6646+ * Unless required by applicable law or agreed to in writing, software
6647+ * distributed under the License is distributed on an "AS IS" BASIS,
6648+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6649+ * See the License for the specific language governing permissions and
6650+ * limitations under the License.
6651+ */
6652+#ifndef MINDSPORE_NNACL_CAST_BASE_SIMD_H_
6653+#define MINDSPORE_NNACL_CAST_BASE_SIMD_H_
6654+
6655+#include "nnacl/intrinsics/ms_simd_instructions.h"
6656+#ifdef ENABLE_AVX512
6657+#include "nnacl/avx512/cast_base_avx512.h"
6658+#endif
6659+
6660+#ifdef ENABLE_AVX
6661+#include "nnacl/avx/cast_base_avx.h"
6662+#endif
6663+
6664+#ifdef ENABLE_SSE
6665+#include "nnacl/sse/cast_base_sse.h"
6666+#endif
6667+
6668+#ifdef ENABLE_ARM
6669+#include "nnacl/neon/cast_base_neon.h"
6670+#endif
6671+
6672+#endif
6673diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
6674new file mode 100644
6675index 00000000..70f79645
6676--- /dev/null
6677+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
6678@@ -0,0 +1,36 @@
6679+/**
6680+ * Copyright 2022 Huawei Technologies Co., Ltd
6681+ *
6682+ * Licensed under the Apache License, Version 2.0 (the "License");
6683+ * you may not use this file except in compliance with the License.
6684+ * You may obtain a copy of the License at
6685+ *
6686+ * http://www.apache.org/licenses/LICENSE-2.0
6687+ *
6688+ * Unless required by applicable law or agreed to in writing, software
6689+ * distributed under the License is distributed on an "AS IS" BASIS,
6690+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6691+ * See the License for the specific language governing permissions and
6692+ * limitations under the License.
6693+ */
6694+#ifndef MINDSPORE_NNACL_CDIST_FP32_SIMD_H_
6695+#define MINDSPORE_NNACL_CDIST_FP32_SIMD_H_
6696+
6697+#include "nnacl/intrinsics/ms_simd_instructions.h"
6698+#ifdef ENABLE_AVX512
6699+#include "nnacl/avx512/cdist_fp32_avx512.h"
6700+#endif
6701+
6702+#ifdef ENABLE_AVX
6703+#include "nnacl/avx/cdist_fp32_avx.h"
6704+#endif
6705+
6706+#ifdef ENABLE_SSE
6707+#include "nnacl/sse/cdist_fp32_sse.h"
6708+#endif
6709+
6710+#ifdef ENABLE_ARM
6711+#include "nnacl/neon/cdist_fp32_neon.h"
6712+#endif
6713+
6714+#endif
6715diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
6716new file mode 100644
6717index 00000000..b6979626
6718--- /dev/null
6719+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
6720@@ -0,0 +1,36 @@
6721+/**
6722+ * Copyright 2022 Huawei Technologies Co., Ltd
6723+ *
6724+ * Licensed under the Apache License, Version 2.0 (the "License");
6725+ * you may not use this file except in compliance with the License.
6726+ * You may obtain a copy of the License at
6727+ *
6728+ * http://www.apache.org/licenses/LICENSE-2.0
6729+ *
6730+ * Unless required by applicable law or agreed to in writing, software
6731+ * distributed under the License is distributed on an "AS IS" BASIS,
6732+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6733+ * See the License for the specific language governing permissions and
6734+ * limitations under the License.
6735+ */
6736+#ifndef MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_
6737+#define MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_
6738+
6739+#include "nnacl/intrinsics/ms_simd_instructions.h"
6740+#ifdef ENABLE_AVX512
6741+#include "nnacl/avx512/cumsum_fp32_avx512.h"
6742+#endif
6743+
6744+#ifdef ENABLE_AVX
6745+#include "nnacl/avx/cumsum_fp32_avx.h"
6746+#endif
6747+
6748+#ifdef ENABLE_SSE
6749+#include "nnacl/sse/cumsum_fp32_sse.h"
6750+#endif
6751+
6752+#ifdef ENABLE_ARM
6753+#include "nnacl/neon/cumsum_fp32_neon.h"
6754+#endif
6755+
6756+#endif
6757diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
6758new file mode 100644
6759index 00000000..dcae16ff
6760--- /dev/null
6761+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
6762@@ -0,0 +1,36 @@
6763+/**
6764+ * Copyright 2022 Huawei Technologies Co., Ltd
6765+ *
6766+ * Licensed under the Apache License, Version 2.0 (the "License");
6767+ * you may not use this file except in compliance with the License.
6768+ * You may obtain a copy of the License at
6769+ *
6770+ * http://www.apache.org/licenses/LICENSE-2.0
6771+ *
6772+ * Unless required by applicable law or agreed to in writing, software
6773+ * distributed under the License is distributed on an "AS IS" BASIS,
6774+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6775+ * See the License for the specific language governing permissions and
6776+ * limitations under the License.
6777+ */
6778+#ifndef MINDSPORE_NNACL_DIV_FP32_SIMD_H_
6779+#define MINDSPORE_NNACL_DIV_FP32_SIMD_H_
6780+
6781+#include "nnacl/intrinsics/ms_simd_instructions.h"
6782+#ifdef ENABLE_AVX512
6783+#include "nnacl/avx512/div_fp32_avx512.h"
6784+#endif
6785+
6786+#ifdef ENABLE_AVX
6787+#include "nnacl/avx/div_fp32_avx.h"
6788+#endif
6789+
6790+#ifdef ENABLE_SSE
6791+#include "nnacl/sse/div_fp32_sse.h"
6792+#endif
6793+
6794+#ifdef ENABLE_ARM
6795+#include "nnacl/neon/div_fp32_neon.h"
6796+#endif
6797+
6798+#endif
6799diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
6800new file mode 100644
6801index 00000000..704591c5
6802--- /dev/null
6803+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
6804@@ -0,0 +1,36 @@
6805+/**
6806+ * Copyright 2022 Huawei Technologies Co., Ltd
6807+ *
6808+ * Licensed under the Apache License, Version 2.0 (the "License");
6809+ * you may not use this file except in compliance with the License.
6810+ * You may obtain a copy of the License at
6811+ *
6812+ * http://www.apache.org/licenses/LICENSE-2.0
6813+ *
6814+ * Unless required by applicable law or agreed to in writing, software
6815+ * distributed under the License is distributed on an "AS IS" BASIS,
6816+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6817+ * See the License for the specific language governing permissions and
6818+ * limitations under the License.
6819+ */
6820+#ifndef MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_
6821+#define MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_
6822+
6823+#include "nnacl/intrinsics/ms_simd_instructions.h"
6824+#ifdef ENABLE_AVX512
6825+#include "nnacl/avx512/dropout_fp32_avx512.h"
6826+#endif
6827+
6828+#ifdef ENABLE_AVX
6829+#include "nnacl/avx/dropout_fp32_avx.h"
6830+#endif
6831+
6832+#ifdef ENABLE_SSE
6833+#include "nnacl/sse/dropout_fp32_sse.h"
6834+#endif
6835+
6836+#ifdef ENABLE_ARM
6837+#include "nnacl/neon/dropout_fp32_neon.h"
6838+#endif
6839+
6840+#endif
6841diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
6842new file mode 100644
6843index 00000000..272f5934
6844--- /dev/null
6845+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
6846@@ -0,0 +1,36 @@
6847+/**
6848+ * Copyright 2022 Huawei Technologies Co., Ltd
6849+ *
6850+ * Licensed under the Apache License, Version 2.0 (the "License");
6851+ * you may not use this file except in compliance with the License.
6852+ * You may obtain a copy of the License at
6853+ *
6854+ * http://www.apache.org/licenses/LICENSE-2.0
6855+ *
6856+ * Unless required by applicable law or agreed to in writing, software
6857+ * distributed under the License is distributed on an "AS IS" BASIS,
6858+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6859+ * See the License for the specific language governing permissions and
6860+ * limitations under the License.
6861+ */
6862+#ifndef MINDSPORE_NNACL_EXP_FP32_SIMD_H_
6863+#define MINDSPORE_NNACL_EXP_FP32_SIMD_H_
6864+
6865+#include "nnacl/intrinsics/ms_simd_instructions.h"
6866+#ifdef ENABLE_AVX512
6867+#include "nnacl/avx512/exp_fp32_avx512.h"
6868+#endif
6869+
6870+#ifdef ENABLE_AVX
6871+#include "nnacl/avx/exp_fp32_avx.h"
6872+#endif
6873+
6874+#ifdef ENABLE_SSE
6875+#include "nnacl/sse/exp_fp32_sse.h"
6876+#endif
6877+
6878+#ifdef ENABLE_ARM
6879+#include "nnacl/neon/exp_fp32_neon.h"
6880+#endif
6881+
6882+#endif
6883diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
6884new file mode 100644
6885index 00000000..f3099405
6886--- /dev/null
6887+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
6888@@ -0,0 +1,36 @@
6889+/**
6890+ * Copyright 2022 Huawei Technologies Co., Ltd
6891+ *
6892+ * Licensed under the Apache License, Version 2.0 (the "License");
6893+ * you may not use this file except in compliance with the License.
6894+ * You may obtain a copy of the License at
6895+ *
6896+ * http://www.apache.org/licenses/LICENSE-2.0
6897+ *
6898+ * Unless required by applicable law or agreed to in writing, software
6899+ * distributed under the License is distributed on an "AS IS" BASIS,
6900+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6901+ * See the License for the specific language governing permissions and
6902+ * limitations under the License.
6903+ */
6904+#ifndef MINDSPORE_NNACL_FILL_BASE_SIMD_H_
6905+#define MINDSPORE_NNACL_FILL_BASE_SIMD_H_
6906+
6907+#include "nnacl/intrinsics/ms_simd_instructions.h"
6908+#ifdef ENABLE_AVX512
6909+#include "nnacl/avx512/fill_base_avx512.h"
6910+#endif
6911+
6912+#ifdef ENABLE_AVX
6913+#include "nnacl/avx/fill_base_avx.h"
6914+#endif
6915+
6916+#ifdef ENABLE_SSE
6917+#include "nnacl/sse/fill_base_sse.h"
6918+#endif
6919+
6920+#ifdef ENABLE_ARM
6921+#include "nnacl/neon/fill_base_neon.h"
6922+#endif
6923+
6924+#endif
6925diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
6926new file mode 100644
6927index 00000000..a3931c20
6928--- /dev/null
6929+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
6930@@ -0,0 +1,36 @@
6931+/**
6932+ * Copyright 2022 Huawei Technologies Co., Ltd
6933+ *
6934+ * Licensed under the Apache License, Version 2.0 (the "License");
6935+ * you may not use this file except in compliance with the License.
6936+ * You may obtain a copy of the License at
6937+ *
6938+ * http://www.apache.org/licenses/LICENSE-2.0
6939+ *
6940+ * Unless required by applicable law or agreed to in writing, software
6941+ * distributed under the License is distributed on an "AS IS" BASIS,
6942+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6943+ * See the License for the specific language governing permissions and
6944+ * limitations under the License.
6945+ */
6946+#ifndef MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_
6947+#define MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_
6948+
6949+#include "nnacl/intrinsics/ms_simd_instructions.h"
6950+#ifdef ENABLE_AVX512
6951+#include "nnacl/avx512/group_norm_fp32_avx512.h"
6952+#endif
6953+
6954+#ifdef ENABLE_AVX
6955+#include "nnacl/avx/group_norm_fp32_avx.h"
6956+#endif
6957+
6958+#ifdef ENABLE_SSE
6959+#include "nnacl/sse/group_norm_fp32_sse.h"
6960+#endif
6961+
6962+#ifdef ENABLE_ARM
6963+#include "nnacl/neon/group_norm_fp32_neon.h"
6964+#endif
6965+
6966+#endif
6967diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
6968new file mode 100644
6969index 00000000..c08461d3
6970--- /dev/null
6971+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
6972@@ -0,0 +1,36 @@
6973+/**
6974+ * Copyright 2022 Huawei Technologies Co., Ltd
6975+ *
6976+ * Licensed under the Apache License, Version 2.0 (the "License");
6977+ * you may not use this file except in compliance with the License.
6978+ * You may obtain a copy of the License at
6979+ *
6980+ * http://www.apache.org/licenses/LICENSE-2.0
6981+ *
6982+ * Unless required by applicable law or agreed to in writing, software
6983+ * distributed under the License is distributed on an "AS IS" BASIS,
6984+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
6985+ * See the License for the specific language governing permissions and
6986+ * limitations under the License.
6987+ */
6988+#ifndef MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_
6989+#define MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_
6990+
6991+#include "nnacl/intrinsics/ms_simd_instructions.h"
6992+#ifdef ENABLE_AVX512
6993+#include "nnacl/avx512/layer_norm_fp32_avx512.h"
6994+#endif
6995+
6996+#ifdef ENABLE_AVX
6997+#include "nnacl/avx/layer_norm_fp32_avx.h"
6998+#endif
6999+
7000+#ifdef ENABLE_SSE
7001+#include "nnacl/sse/layer_norm_fp32_sse.h"
7002+#endif
7003+
7004+#ifdef ENABLE_ARM
7005+#include "nnacl/neon/layer_norm_fp32_neon.h"
7006+#endif
7007+
7008+#endif
7009diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
7010new file mode 100644
7011index 00000000..1250f3fc
7012--- /dev/null
7013+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
7014@@ -0,0 +1,36 @@
7015+/**
7016+ * Copyright 2022 Huawei Technologies Co., Ltd
7017+ *
7018+ * Licensed under the Apache License, Version 2.0 (the "License");
7019+ * you may not use this file except in compliance with the License.
7020+ * You may obtain a copy of the License at
7021+ *
7022+ * http://www.apache.org/licenses/LICENSE-2.0
7023+ *
7024+ * Unless required by applicable law or agreed to in writing, software
7025+ * distributed under the License is distributed on an "AS IS" BASIS,
7026+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7027+ * See the License for the specific language governing permissions and
7028+ * limitations under the License.
7029+ */
7030+#ifndef MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_
7031+#define MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_
7032+
7033+#include "nnacl/intrinsics/ms_simd_instructions.h"
7034+#ifdef ENABLE_AVX512
7035+#include "nnacl/avx512/matmul_fp32_avx512.h"
7036+#endif
7037+
7038+#ifdef ENABLE_AVX
7039+#include "nnacl/avx/matmul_fp32_avx.h"
7040+#endif
7041+
7042+#ifdef ENABLE_SSE
7043+#include "nnacl/sse/matmul_fp32_sse.h"
7044+#endif
7045+
7046+#ifdef ENABLE_ARM
7047+#include "nnacl/neon/matmul_fp32_neon.h"
7048+#endif
7049+
7050+#endif
7051diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
7052new file mode 100644
7053index 00000000..31e08b08
7054--- /dev/null
7055+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
7056@@ -0,0 +1,36 @@
7057+/**
7058+ * Copyright 2022 Huawei Technologies Co., Ltd
7059+ *
7060+ * Licensed under the Apache License, Version 2.0 (the "License");
7061+ * you may not use this file except in compliance with the License.
7062+ * You may obtain a copy of the License at
7063+ *
7064+ * http://www.apache.org/licenses/LICENSE-2.0
7065+ *
7066+ * Unless required by applicable law or agreed to in writing, software
7067+ * distributed under the License is distributed on an "AS IS" BASIS,
7068+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7069+ * See the License for the specific language governing permissions and
7070+ * limitations under the License.
7071+ */
7072+#ifndef MINDSPORE_NNACL_MUL_FP32_SIMD_H_
7073+#define MINDSPORE_NNACL_MUL_FP32_SIMD_H_
7074+
7075+#include "nnacl/intrinsics/ms_simd_instructions.h"
7076+#ifdef ENABLE_AVX512
7077+#include "nnacl/avx512/mul_fp32_avx512.h"
7078+#endif
7079+
7080+#ifdef ENABLE_AVX
7081+#include "nnacl/avx/mul_fp32_avx.h"
7082+#endif
7083+
7084+#ifdef ENABLE_SSE
7085+#include "nnacl/sse/mul_fp32_sse.h"
7086+#endif
7087+
7088+#ifdef ENABLE_ARM
7089+#include "nnacl/neon/mul_fp32_neon.h"
7090+#endif
7091+
7092+#endif
7093diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
7094new file mode 100644
7095index 00000000..42d163f6
7096--- /dev/null
7097+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
7098@@ -0,0 +1,220 @@
7099+/**
7100+ * Copyright 2022 Huawei Technologies Co., Ltd
7101+ *
7102+ * Licensed under the Apache License, Version 2.0 (the "License");
7103+ * you may not use this file except in compliance with the License.
7104+ * You may obtain a copy of the License at
7105+ *
7106+ * http://www.apache.org/licenses/LICENSE-2.0
7107+ *
7108+ * Unless required by applicable law or agreed to in writing, software
7109+ * distributed under the License is distributed on an "AS IS" BASIS,
7110+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7111+ * See the License for the specific language governing permissions and
7112+ * limitations under the License.
7113+ */
7114+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
7115+#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
7116+
7117+#include "nnacl/intrinsics/ms_simd_instructions.h"
7118+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7119+
7120+#ifdef __cplusplus
7121+extern "C" {
7122+#endif
7123+
7124+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7125+#define BLOCK_NUM 4
7126+#define MS_SIMD_NEON
7127+
7128+static inline int Fp32ReluNEON(int index, const float *src, int length, float *dst) {
7129+    SIMD_F32 zero = SIMD_SET0_F32;
7130+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7131+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
7132+    }
7133+    return index;
7134+}
7135+
7136+static inline int Int32ReluNEON(int index, const int32_t *src, int length, int32_t *dst) {
7137+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0);
7138+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7139+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
7140+    }
7141+    return index;
7142+}
7143+
7144+static inline int Fp32Relu6NEON(int index, const float *src, int length, float *dst) {
7145+    SIMD_F32 zero = SIMD_SET0_F32;
7146+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
7147+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7148+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
7149+    }
7150+    return index;
7151+}
7152+
7153+static inline int LReluNEON(int index, const float *src, int length, float *dst, float alpha) {
7154+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
7155+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7156+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7157+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
7158+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
7159+    }
7160+    return index;
7161+}
7162+
7163+static inline int SigmoidNEON(int index, const float *src, int length, float *dst) {
7164+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7165+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
7166+        SIMD_ST_F32(dst + index,
7167+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
7168+    }
7169+    return index;
7170+}
7171+
7172+static inline int TanhNEON(int index, const float *src, int length, float *dst) {
7173+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7174+        SIMD_F32 input = SIMD_LD_F32(src + index);
7175+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
7176+    }
7177+    return index;
7178+}
7179+
7180+static inline int SwishNEON(int index, const float *src, int length, float *dst) {
7181+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7182+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
7183+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
7184+        SIMD_ST_F32(dst + index,
7185+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
7186+    }
7187+    return index;
7188+}
7189+
7190+static inline int HSwishNEON(int index, const float *src, int length, float *dst) {
7191+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7192+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
7193+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
7194+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
7195+    }
7196+    return index;
7197+}
7198+
7199+static inline int HSigmoidNEON(int index, const float *src, int length, float *dst) {
7200+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7201+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
7202+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
7203+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
7204+    }
7205+    return index;
7206+}
7207+
7208+static inline int HardTanhNoLimitMinNEON(int index, const float *src, int length, float *dst, float min_val,
7209+                                            float max_val) {
7210+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7211+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
7212+    }
7213+    return index;
7214+}
7215+
7216+static inline int HardTanhNoLimitMaxNEON(int index, const float *src, int length, float *dst, float min_val,
7217+                                            float max_val) {
7218+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7219+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
7220+    }
7221+    return index;
7222+}
7223+
7224+static inline int HardTanhLimitMinMaxNEON(int index, const float *src, int length, float *dst, float min_val,
7225+                                             float max_val) {
7226+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7227+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
7228+    }
7229+    return index;
7230+}
7231+
7232+static inline int GeluApproximateNEON(int index, const float *src, int length, float *dst) {
7233+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7234+        SIMD_F32 in = SIMD_LD_F32(src + index);
7235+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
7236+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
7237+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
7238+    }
7239+    return index;
7240+}
7241+
7242+static inline int GeluNEON(int index, const float *src, int length, float *dst) {
7243+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
7244+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
7245+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
7246+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7247+      SIMD_F32 in = SIMD_LD_F32(src + index);
7248+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
7249+      SIMD_ST_F32(dst + index, res);
7250+    }
7251+    return index;
7252+}
7253+
7254+static inline int EluNEON(int index, const float *src, int length, float *dst, float alpha) {
7255+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7256+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7257+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
7258+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
7259+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
7260+    }
7261+    return index;
7262+}
7263+
7264+static inline int CeluNEON(int index, const float *src, int length, float *dst, float alpha) {
7265+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7266+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7267+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
7268+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
7269+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
7270+    }
7271+    return index;
7272+}
7273+
7274+static inline int HShrinkNEON(int index, const float *src, int length, float *dst, float lambd) {
7275+    const float neg_lambd = -1 * lambd;
7276+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7277+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7278+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
7279+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
7280+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
7281+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
7282+    }
7283+    return index;
7284+}
7285+
7286+static inline int SoftShrinkNEON(int index, const float *src, int length, float *dst, float lambd) {
7287+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
7288+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
7289+
7290+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7291+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
7292+        /* v0 = (in > lamdb) & (in - lamdb) */
7293+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
7294+        /* v1 = (in < -lamdb) & (in + lamdb) */
7295+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
7296+        /* out = (v0 | v1) */
7297+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
7298+    }
7299+    return index;
7300+}
7301+
7302+static inline int SoftsignFp32OptNEON(int index, const float *src, int length, float *dst) {
7303+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7304+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
7305+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
7306+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
7307+    }
7308+    return index;
7309+}
7310+
7311+#undef MS_SIMD_INSTRUCTION
7312+#undef BLOCK_NUM
7313+
7314+#undef MS_SIMD_NEON
7315+#ifdef __cplusplus
7316+}
7317+#endif
7318+#endif
7319diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
7320new file mode 100644
7321index 00000000..df832e51
7322--- /dev/null
7323+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
7324@@ -0,0 +1,56 @@
7325+/**
7326+ * Copyright 2022 Huawei Technologies Co., Ltd
7327+ *
7328+ * Licensed under the Apache License, Version 2.0 (the "License");
7329+ * you may not use this file except in compliance with the License.
7330+ * You may obtain a copy of the License at
7331+ *
7332+ * http://www.apache.org/licenses/LICENSE-2.0
7333+ *
7334+ * Unless required by applicable law or agreed to in writing, software
7335+ * distributed under the License is distributed on an "AS IS" BASIS,
7336+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7337+ * See the License for the specific language governing permissions and
7338+ * limitations under the License.
7339+ */
7340+
7341+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_
7342+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_
7343+
7344+#include "nnacl/intrinsics/ms_simd_instructions.h"
7345+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7346+
7347+#ifdef __cplusplus
7348+extern "C" {
7349+#endif
7350+
7351+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7352+#define BLOCK_NUM 4
7353+#define MS_SIMD_NEON
7354+
7355+static inline int ShrinkGradNEON(int index, const float *src0, const float *src1,
7356+                                               int length, float *dst, float lambd) {
7357+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
7358+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
7359+
7360+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7361+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
7362+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
7363+
7364+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
7365+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
7366+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
7367+
7368+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
7369+    }
7370+    return index;
7371+}
7372+
7373+#undef MS_SIMD_INSTRUCTION
7374+#undef BLOCK_NUM
7375+
7376+#undef MS_SIMD_NEON
7377+#ifdef __cplusplus
7378+}
7379+#endif
7380+#endif
7381diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
7382new file mode 100644
7383index 00000000..fda41ec2
7384--- /dev/null
7385+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
7386@@ -0,0 +1,209 @@
7387+/**
7388+ * Copyright 2022 Huawei Technologies Co., Ltd
7389+ *
7390+ * Licensed under the Apache License, Version 2.0 (the "License");
7391+ * you may not use this file except in compliance with the License.
7392+ * You may obtain a copy of the License at
7393+ *
7394+ * http://www.apache.org/licenses/LICENSE-2.0
7395+ *
7396+ * Unless required by applicable law or agreed to in writing, software
7397+ * distributed under the License is distributed on an "AS IS" BASIS,
7398+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7399+ * See the License for the specific language governing permissions and
7400+ * limitations under the License.
7401+ */
7402+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_
7403+#define MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_
7404+
7405+#include "nnacl/intrinsics/ms_simd_instructions.h"
7406+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7407+
7408+#ifdef __cplusplus
7409+extern "C" {
7410+#endif
7411+
7412+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7413+#define BLOCK_NUM 4
7414+#define MS_SIMD_NEON
7415+#ifdef MS_SIMD_AVX512
7416+  static inline size_t AdamWeightDecayFp32NEON(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7417+    const float *gradient, size_t end) {
7418+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7419+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7420+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7421+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7422+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7423+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7424+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7425+
7426+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7427+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
7428+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7429+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7430+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
7431+
7432+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7433+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7434+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7435+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7436+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7437+    avx_r0 = SIMD_SQRT_F32(v_r);
7438+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7439+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7440+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7441+    SIMD_ST_F32(m + index, m_r);
7442+    SIMD_ST_F32(v + index, v_r);
7443+    SIMD_ST_F32(var + index, var_r);
7444+  }
7445+
7446+  return index;
7447+}
7448+
7449+static inline size_t FusedCastAdamFp32Fp16NEON(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7450+    float global_norm_reciprocal, size_t end) {
7451+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7452+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7453+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7454+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7455+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7456+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7457+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7458+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7459+
7460+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7461+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
7462+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7463+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7464+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
7465+
7466+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7467+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7468+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7469+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7470+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7471+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7472+    avx_r0 = SIMD_SQRT_F32(v_r);
7473+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7474+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7475+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7476+    SIMD_ST_F32(var + index, var_r);
7477+    SIMD_ST_F32(m + index, m_r);
7478+    SIMD_ST_F32(v + index, v_r);
7479+  }
7480+
7481+  return index;
7482+}
7483+
7484+static inline size_t FusedCastAdamFp32Fp32NEON(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7485+    float global_norm_reciprocal, size_t end) {
7486+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7487+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7488+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7489+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7490+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7491+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7492+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7493+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7494+
7495+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7496+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
7497+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7498+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7499+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
7500+
7501+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7502+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7503+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7504+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7505+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7506+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7507+    avx_r0 = SIMD_SQRT_F32(v_r);
7508+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7509+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7510+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7511+    SIMD_ST_F32(var + index, var_r);
7512+    SIMD_ST_F32(m + index, m_r);
7513+    SIMD_ST_F32(v + index, v_r);
7514+  }
7515+
7516+  return index;
7517+}
7518+
7519+static inline size_t FusedCastAdamFp16Fp16NEON(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7520+    float global_norm_reciprocal, size_t end) {
7521+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7522+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7523+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7524+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7525+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7526+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7527+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7528+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7529+
7530+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7531+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
7532+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7533+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7534+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
7535+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7536+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7537+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7538+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7539+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7540+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7541+    avx_r0 = SIMD_SQRT_F32(v_r);
7542+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7543+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7544+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7545+    SIMD_ST_F32(m + index, m_r);
7546+    SIMD_ST_F32(v + index, v_r);
7547+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
7548+  }
7549+
7550+  return index;
7551+}
7552+
7553+static inline size_t FusedCastAdamFp16Fp32NEON(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
7554+    float global_norm_reciprocal, size_t end) {
7555+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
7556+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
7557+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
7558+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
7559+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
7560+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
7561+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
7562+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
7563+
7564+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7565+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
7566+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
7567+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
7568+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
7569+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
7570+    m_r = SIMD_MUL_F32(m_r, beta1_r);
7571+    v_r = SIMD_MUL_F32(v_r, beta2_r);
7572+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
7573+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
7574+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
7575+    avx_r0 = SIMD_SQRT_F32(v_r);
7576+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
7577+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
7578+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
7579+    SIMD_ST_F32(m + index, m_r);
7580+    SIMD_ST_F32(v + index, v_r);
7581+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
7582+  }
7583+
7584+  return index;
7585+}
7586+#endif
7587+
7588+#undef MS_SIMD_INSTRUCTION
7589+#undef BLOCK_NUM
7590+
7591+#undef MS_SIMD_NEON
7592+#ifdef __cplusplus
7593+}
7594+#endif
7595+#endif
7596diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
7597new file mode 100644
7598index 00000000..4ef32418
7599--- /dev/null
7600+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
7601@@ -0,0 +1,123 @@
7602+/**
7603+ * Copyright 2022 Huawei Technologies Co., Ltd
7604+ *
7605+ * Licensed under the Apache License, Version 2.0 (the "License");
7606+ * you may not use this file except in compliance with the License.
7607+ * You may obtain a copy of the License at
7608+ *
7609+ * http://www.apache.org/licenses/LICENSE-2.0
7610+ *
7611+ * Unless required by applicable law or agreed to in writing, software
7612+ * distributed under the License is distributed on an "AS IS" BASIS,
7613+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7614+ * See the License for the specific language governing permissions and
7615+ * limitations under the License.
7616+ */
7617+
7618+#ifndef MINDSPORE_NNACL_FP32_ADD_NEON_H_
7619+#define MINDSPORE_NNACL_FP32_ADD_NEON_H_
7620+
7621+#include "nnacl/intrinsics/ms_simd_instructions.h"
7622+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7623+
7624+#ifdef __cplusplus
7625+extern "C" {
7626+#endif
7627+
7628+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7629+#define BLOCK_NUM 4
7630+#define MS_SIMD_NEON
7631+
7632+static inline int ElementOptAddNEON(int index, const float *in0, const float *in1, float *out, int size) {
7633+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
7634+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7635+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7636+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
7637+    SIMD_ST_F32(out + index, vout);
7638+  }
7639+  return index;
7640+}
7641+
7642+static inline int ElementOptAddIntNEON(int index, const int *in0, const int *in1, int *out,
7643+                                                     int size) {
7644+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
7645+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7646+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
7647+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
7648+    SIMD_ST_EPI32(out + index, vout);
7649+  }
7650+  return index;
7651+}
7652+
7653+static inline int ElementOptAddReluNEON(int index, const float *in0, const float *in1, float *out,
7654+                                                      int size) {
7655+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
7656+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7657+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7658+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
7659+    SIMD_ST_F32(out + index, vout);
7660+  }
7661+  return index;
7662+}
7663+
7664+static inline int ElementOptAddRelu6NEON(int index, const float *in0, const float *in1, float *out,
7665+                                                       int size) {
7666+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
7667+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7668+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7669+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
7670+    SIMD_ST_F32(out + index, vout);
7671+  }
7672+  return index;
7673+}
7674+
7675+static inline int ElementAddNEON(int index, const float *in0, const float *in1, float *out, int size) {
7676+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7677+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
7678+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7679+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
7680+    SIMD_ST_F32(out + index, vout);
7681+  }
7682+  return index;
7683+}
7684+
7685+static inline int ElementAddReluNEON(int index, const float *in0, const float *in1, float *out,
7686+                                                   int size) {
7687+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7688+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
7689+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7690+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
7691+    SIMD_ST_F32(out + index, vout);
7692+  }
7693+  return index;
7694+}
7695+
7696+static inline int ElementAddRelu6NEON(int index, const float *in0, const float *in1, float *out,
7697+                                                    int size) {
7698+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7699+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
7700+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
7701+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
7702+    SIMD_ST_F32(out + index, vout);
7703+  }
7704+  return index;
7705+}
7706+
7707+static inline int ElementAddIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7708+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7709+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
7710+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
7711+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
7712+    SIMD_ST_EPI32(out + index, vout);
7713+  }
7714+  return index;
7715+}
7716+
7717+#undef MS_SIMD_INSTRUCTION
7718+#undef BLOCK_NUM
7719+
7720+#undef MS_SIMD_NEON
7721+#ifdef __cplusplus
7722+}
7723+#endif
7724+#endif
7725diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
7726new file mode 100644
7727index 00000000..2449c07d
7728--- /dev/null
7729+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
7730@@ -0,0 +1,253 @@
7731+/**
7732+ * Copyright 2022 Huawei Technologies Co., Ltd
7733+ *
7734+ * Licensed under the Apache License, Version 2.0 (the "License");
7735+ * you may not use this file except in compliance with the License.
7736+ * You may obtain a copy of the License at
7737+ *
7738+ * http://www.apache.org/licenses/LICENSE-2.0
7739+ *
7740+ * Unless required by applicable law or agreed to in writing, software
7741+ * distributed under the License is distributed on an "AS IS" BASIS,
7742+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7743+ * See the License for the specific language governing permissions and
7744+ * limitations under the License.
7745+ */
7746+
7747+#ifndef MINDSPORE_NNACL_ARITHMETIC_NEON_H_
7748+#define MINDSPORE_NNACL_ARITHMETIC_NEON_H_
7749+
7750+#include "nnacl/intrinsics/ms_simd_instructions.h"
7751+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
7752+
7753+#ifdef __cplusplus
7754+extern "C" {
7755+#endif
7756+
7757+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
7758+#define BLOCK_NUM 4
7759+#define MS_SIMD_NEON
7760+
7761+#ifndef MS_SIMD_NEON
7762+static inline int ElementFloorModNEON(int index, const float *in0, const float *in1, float *out, int size) {
7763+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7764+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7765+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7766+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7767+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
7768+    SIMD_ST_F32(out + index, out_tmp);
7769+  }
7770+  return index;
7771+}
7772+
7773+static inline int ElementOptFloorModNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7774+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7775+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7776+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7777+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7778+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
7779+    SIMD_ST_F32(out + index, out_tmp);
7780+  }
7781+  return index;
7782+}
7783+
7784+static inline int ElementOptFloorModNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7785+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7786+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7787+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7788+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7789+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
7790+    SIMD_ST_F32(out + index, out_tmp);
7791+  }
7792+  return index;
7793+}
7794+
7795+static inline int ElementFloorDivNEON(int index, const float *in0, const float *in1, float *out, int size) {
7796+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7797+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7798+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7799+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7800+    SIMD_ST_F32(out + index, floor_tmp);
7801+  }
7802+  return index;
7803+}
7804+
7805+static inline int ElementOptFloorDivNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7806+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7807+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7808+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7809+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7810+    SIMD_ST_F32(out + index, out_tmp);
7811+  }
7812+  return index;
7813+}
7814+
7815+static inline int ElementOptFloorDivNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7816+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7817+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7818+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7819+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
7820+    SIMD_ST_F32(out + index, out_tmp);
7821+  }
7822+  return index;
7823+}
7824+#endif
7825+
7826+static inline int ElementFloorDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7827+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7828+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7829+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7830+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
7831+    SIMD_ST_EPI32(out + index, out_tmp);
7832+  }
7833+  return index;
7834+}
7835+
7836+static inline int ElementOptFloorDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
7837+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
7838+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7839+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7840+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
7841+    SIMD_ST_EPI32(out + index, out_tmp);
7842+  }
7843+  return index;
7844+}
7845+
7846+static inline int ElementOptFloorDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
7847+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
7848+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7849+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7850+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
7851+    SIMD_ST_EPI32(out + index, out_tmp);
7852+  }
7853+  return index;
7854+}
7855+
7856+static inline int ElementMaximumNEON(int index, const float *in0, const float *in1, float *out, int size) {
7857+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7858+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7859+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7860+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
7861+    SIMD_ST_F32(out + index, out_tmp);
7862+  }
7863+  return index;
7864+}
7865+
7866+static inline int ElementOptMaximumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7867+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7868+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7869+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7870+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
7871+    SIMD_ST_F32(out + index, out_tmp);
7872+  }
7873+  return index;
7874+}
7875+
7876+static inline int ElementOptMaximumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7877+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7878+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7879+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7880+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
7881+    SIMD_ST_F32(out + index, out_tmp);
7882+  }
7883+  return index;
7884+}
7885+
7886+static inline int ElementMaximumIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7887+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7888+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7889+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7890+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
7891+    SIMD_ST_EPI32(out + index, out_tmp);
7892+  }
7893+  return index;
7894+}
7895+
7896+static inline int ElementOptMaximumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
7897+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
7898+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7899+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7900+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
7901+    SIMD_ST_EPI32(out + index, out_tmp);
7902+  }
7903+  return index;
7904+}
7905+
7906+static inline int ElementOptMaximumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
7907+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
7908+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7909+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7910+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
7911+    SIMD_ST_EPI32(out + index, out_tmp);
7912+  }
7913+  return index;
7914+}
7915+
7916+static inline int ElementMinimumIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
7917+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7918+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7919+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7920+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
7921+    SIMD_ST_EPI32(out + index, out_tmp);
7922+  }
7923+  return index;
7924+}
7925+
7926+static inline int ElementOptMinimumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
7927+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
7928+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7929+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
7930+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
7931+    SIMD_ST_EPI32(out + index, out_tmp);
7932+  }
7933+  return index;
7934+}
7935+
7936+static inline int ElementOptMinimumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
7937+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
7938+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7939+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
7940+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
7941+    SIMD_ST_EPI32(out + index, out_tmp);
7942+  }
7943+  return index;
7944+}
7945+
7946+static inline int ElementMinimumNEON(int index, const float *in0, const float *in1, float *out, int size) {
7947+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7948+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7949+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7950+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
7951+    SIMD_ST_F32(out + index, out_tmp);
7952+  }
7953+  return index;
7954+}
7955+
7956+static inline int ElementOptMinimumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
7957+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
7958+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7959+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
7960+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
7961+    SIMD_ST_F32(out + index, out_tmp);
7962+  }
7963+  return index;
7964+}
7965+
7966+static inline int ElementOptMinimumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
7967+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
7968+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
7969+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
7970+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
7971+    SIMD_ST_F32(out + index, out_tmp);
7972+  }
7973+  return index;
7974+}
7975+
7976+#undef MS_SIMD_INSTRUCTION
7977+#undef BLOCK_NUM
7978+
7979+#undef MS_SIMD_NEON
7980+#ifdef __cplusplus
7981+}
7982+#endif
7983+#endif
7984diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
7985new file mode 100644
7986index 00000000..682148d7
7987--- /dev/null
7988+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
7989@@ -0,0 +1,128 @@
7990+/**
7991+ * Copyright 2022 Huawei Technologies Co., Ltd
7992+ *
7993+ * Licensed under the Apache License, Version 2.0 (the "License");
7994+ * you may not use this file except in compliance with the License.
7995+ * You may obtain a copy of the License at
7996+ *
7997+ * http://www.apache.org/licenses/LICENSE-2.0
7998+ *
7999+ * Unless required by applicable law or agreed to in writing, software
8000+ * distributed under the License is distributed on an "AS IS" BASIS,
8001+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8002+ * See the License for the specific language governing permissions and
8003+ * limitations under the License.
8004+ */
8005+
8006+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_
8007+#define MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_
8008+
8009+#include "nnacl/intrinsics/ms_simd_instructions.h"
8010+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8011+
8012+#ifdef __cplusplus
8013+extern "C" {
8014+#endif
8015+
8016+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8017+#define BLOCK_NUM 4
8018+#define MS_SIMD_NEON
8019+
8020+#if defined(MS_SIMD_AVX512)
8021+// only avx512 support abs fp32 instruction
8022+static inline int ElementAbsNEON(int index, const float *input, float *output, const int element_size) {
8023+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8024+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
8025+  }
8026+  return index;
8027+}
8028+
8029+static inline int ElementAbsIntNEON(int index, const int *input, int *output, const int element_size) {
8030+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8031+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
8032+  }
8033+  return index;
8034+}
8035+#endif
8036+
8037+static inline int ElementSquareNEON(int index, const float *input, float *output, const int element_size) {
8038+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8039+    SIMD_F32 vin = SIMD_LD_F32(input + index);
8040+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
8041+  }
8042+  return index;
8043+}
8044+
8045+static inline int ElementSqrtNEON(int index, const float *input, float *output, const int element_size) {
8046+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8047+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
8048+  }
8049+  return index;
8050+}
8051+
8052+static inline int ElementRsqrtNEON(int index, const float *input, float *output, const int element_size) {
8053+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8054+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
8055+  }
8056+  return index;
8057+}
8058+
8059+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
8060+// avx512 dont support round fp32 instruction
8061+static inline int ElementRoundNEON(int index, const float *input, float *output, const int element_size) {
8062+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8063+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
8064+  }
8065+  return index;
8066+}
8067+#endif
8068+
8069+#ifndef MS_SIMD_NEON
8070+// neon dont support floor fp32 instruction
8071+static inline int ElementFloorNEON(int index, const float *input, float *output, const int element_size) {
8072+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8073+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
8074+  }
8075+  return index;
8076+}
8077+#endif
8078+
8079+#ifndef MS_SIMD_NEON
8080+static inline int ElementCeilNEON(int index, const float *input, float *output, const int element_size) {
8081+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8082+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
8083+  }
8084+  return index;
8085+}
8086+#endif
8087+
8088+static inline int ElementNegativeNEON(int index, const float *input, float *output, const int element_size) {
8089+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8090+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
8091+  }
8092+  return index;
8093+}
8094+
8095+static inline int ElementNegativeIntNEON(int index, const int *input, int *output, const int element_size) {
8096+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8097+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
8098+  }
8099+  return index;
8100+}
8101+
8102+static inline int ElementReciprocalNEON(int index, const float *input, float *output, const int element_size) {
8103+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
8104+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8105+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
8106+  }
8107+  return index;
8108+}
8109+
8110+#undef MS_SIMD_INSTRUCTION
8111+#undef BLOCK_NUM
8112+
8113+#undef MS_SIMD_NEON
8114+#ifdef __cplusplus
8115+}
8116+#endif
8117+#endif
8118diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
8119new file mode 100644
8120index 00000000..5e169d62
8121--- /dev/null
8122+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
8123@@ -0,0 +1,66 @@
8124+/**
8125+ * Copyright 2022 Huawei Technologies Co., Ltd
8126+ *
8127+ * Licensed under the Apache License, Version 2.0 (the "License");
8128+ * you may not use this file except in compliance with the License.
8129+ * You may obtain a copy of the License at
8130+ *
8131+ * http://www.apache.org/licenses/LICENSE-2.0
8132+ *
8133+ * Unless required by applicable law or agreed to in writing, software
8134+ * distributed under the License is distributed on an "AS IS" BASIS,
8135+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8136+ * See the License for the specific language governing permissions and
8137+ * limitations under the License.
8138+ */
8139+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_
8140+#define MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_
8141+
8142+#include "nnacl/intrinsics/ms_simd_instructions.h"
8143+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8144+
8145+#ifdef __cplusplus
8146+extern "C" {
8147+#endif
8148+
8149+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8150+#define BLOCK_NUM 4
8151+#define MS_SIMD_NEON
8152+
8153+static inline int BatchNormFp32NEON(int index, const float *input, const float *mean,
8154+  const float *variance, int channel, float epsilon, float *output) {
8155+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8156+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
8157+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
8158+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
8159+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
8160+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
8161+    SIMD_ST_F32(output + index, output_data);
8162+  }
8163+  return index;
8164+}
8165+
8166+static inline int FusedBatchNormFp32NEON(int index, const float *input, const float *scale,
8167+  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
8168+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8169+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
8170+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
8171+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
8172+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
8173+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
8174+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
8175+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
8176+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
8177+    SIMD_ST_F32(output + index, output_data);
8178+  }
8179+  return index;
8180+}
8181+
8182+#undef MS_SIMD_INSTRUCTION
8183+#undef BLOCK_NUM
8184+
8185+#undef MS_SIMD_NEON
8186+#ifdef __cplusplus
8187+}
8188+#endif
8189+#endif
8190diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
8191new file mode 100644
8192index 00000000..3f52857c
8193--- /dev/null
8194+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
8195@@ -0,0 +1,68 @@
8196+/**
8197+ * Copyright 2022 Huawei Technologies Co., Ltd
8198+ *
8199+ * Licensed under the Apache License, Version 2.0 (the "License");
8200+ * you may not use this file except in compliance with the License.
8201+ * You may obtain a copy of the License at
8202+ *
8203+ * http://www.apache.org/licenses/LICENSE-2.0
8204+ *
8205+ * Unless required by applicable law or agreed to in writing, software
8206+ * distributed under the License is distributed on an "AS IS" BASIS,
8207+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8208+ * See the License for the specific language governing permissions and
8209+ * limitations under the License.
8210+ */
8211+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_
8212+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_
8213+
8214+#include "nnacl/intrinsics/ms_simd_instructions.h"
8215+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8216+
8217+#ifdef __cplusplus
8218+extern "C" {
8219+#endif
8220+
8221+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8222+#define BLOCK_NUM 4
8223+#define MS_SIMD_NEON
8224+
8225+static inline int BCEWithLogitLossNEON(int index, const float *logits, const float *label,
8226+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
8227+    float *reduction_sum) {
8228+    SIMD_F32 zero = SIMD_SET0_F32;
8229+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
8230+    SIMD_F32 middle_output = SIMD_SET0_F32;
8231+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8232+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
8233+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
8234+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
8235+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
8236+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
8237+      SIMD_F32 max_value = neg_logits_tmp;
8238+      max_value = SIMD_MIN_F32(max_value, zero);
8239+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
8240+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
8241+      SIMD_F32 log_exp_value =
8242+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
8243+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
8244+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
8245+      if (reduction) {
8246+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
8247+      } else {
8248+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
8249+      }
8250+    }
8251+    if (reduction) {
8252+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
8253+    }
8254+    return index;
8255+}
8256+#undef MS_SIMD_INSTRUCTION
8257+#undef BLOCK_NUM
8258+
8259+#undef MS_SIMD_NEON
8260+#ifdef __cplusplus
8261+}
8262+#endif
8263+#endif
8264diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
8265new file mode 100644
8266index 00000000..afaf0de5
8267--- /dev/null
8268+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
8269@@ -0,0 +1,63 @@
8270+/**
8271+ * Copyright 2022 Huawei Technologies Co., Ltd
8272+ *
8273+ * Licensed under the Apache License, Version 2.0 (the "License");
8274+ * you may not use this file except in compliance with the License.
8275+ * You may obtain a copy of the License at
8276+ *
8277+ * http://www.apache.org/licenses/LICENSE-2.0
8278+ *
8279+ * Unless required by applicable law or agreed to in writing, software
8280+ * distributed under the License is distributed on an "AS IS" BASIS,
8281+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8282+ * See the License for the specific language governing permissions and
8283+ * limitations under the License.
8284+ */
8285+
8286+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_
8287+#define MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_
8288+
8289+#include "nnacl/intrinsics/ms_simd_instructions.h"
8290+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8291+
8292+#ifdef __cplusplus
8293+extern "C" {
8294+#endif
8295+
8296+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8297+#define BLOCK_NUM 4
8298+#define MS_SIMD_NEON
8299+
8300+static inline int BiasAddByInnerCoreNEON(int index, const float *input, const float *bias, float *output,
8301+                                                       int64_t num) {
8302+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8303+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
8304+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
8305+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
8306+    SIMD_ST_F32(output + index, vout);
8307+  }
8308+  return index;
8309+}
8310+
8311+static inline int BiasAddByBatchCoreNEON(int index, const float *input, const float *bias, float *output1,
8312+                                                       float *output2, float *output3, float *output4, int64_t num) {
8313+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8314+    SIMD_LDX4_F32(input_data, input + index, num);
8315+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
8316+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
8317+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
8318+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
8319+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
8320+  }
8321+  return index;
8322+}
8323+
8324+#undef MS_SIMD_INSTRUCTION
8325+#undef BLOCK_NUM
8326+
8327+#undef MS_SIMD_NEON
8328+#ifdef __cplusplus
8329+}
8330+#endif
8331+
8332+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_
8333diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
8334new file mode 100644
8335index 00000000..8fe26687
8336--- /dev/null
8337+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
8338@@ -0,0 +1,55 @@
8339+/**
8340+ * Copyright 2022 Huawei Technologies Co., Ltd
8341+ *
8342+ * Licensed under the Apache License, Version 2.0 (the "License");
8343+ * you may not use this file except in compliance with the License.
8344+ * You may obtain a copy of the License at
8345+ *
8346+ * http://www.apache.org/licenses/LICENSE-2.0
8347+ *
8348+ * Unless required by applicable law or agreed to in writing, software
8349+ * distributed under the License is distributed on an "AS IS" BASIS,
8350+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8351+ * See the License for the specific language governing permissions and
8352+ * limitations under the License.
8353+ */
8354+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_
8355+#define MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_
8356+
8357+#include "nnacl/intrinsics/ms_simd_instructions.h"
8358+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8359+
8360+#ifdef __cplusplus
8361+extern "C" {
8362+#endif
8363+
8364+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8365+#define BLOCK_NUM 4
8366+#define MS_SIMD_NEON
8367+
8368+static inline int Int32ToFloat32NEON(int index, const int32_t *input, float *output, int number) {
8369+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8370+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
8371+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
8372+  }
8373+  return index;
8374+}
8375+
8376+#ifndef MS_SIMD_NEON
8377+static inline int Float32ToInt32NEON(int index, const float *input, int32_t *output, int number) {
8378+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8379+    SIMD_F32 value = SIMD_LD_F32(input + index);
8380+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
8381+  }
8382+  return index;
8383+}
8384+#endif
8385+
8386+#undef MS_SIMD_INSTRUCTION
8387+#undef BLOCK_NUM
8388+
8389+#undef MS_SIMD_NEON
8390+#ifdef __cplusplus
8391+}
8392+#endif
8393+#endif
8394diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
8395new file mode 100644
8396index 00000000..09f55bbf
8397--- /dev/null
8398+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
8399@@ -0,0 +1,69 @@
8400+/**
8401+ * Copyright 2022 Huawei Technologies Co., Ltd
8402+ *
8403+ * Licensed under the Apache License, Version 2.0 (the "License");
8404+ * you may not use this file except in compliance with the License.
8405+ * You may obtain a copy of the License at
8406+ *
8407+ * http://www.apache.org/licenses/LICENSE-2.0
8408+ *
8409+ * Unless required by applicable law or agreed to in writing, software
8410+ * distributed under the License is distributed on an "AS IS" BASIS,
8411+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8412+ * See the License for the specific language governing permissions and
8413+ * limitations under the License.
8414+ */
8415+#ifndef MINDSPORE_NNACL_FP32_CDIST_NEON_H_
8416+#define MINDSPORE_NNACL_FP32_CDIST_NEON_H_
8417+
8418+#include "nnacl/intrinsics/ms_simd_instructions.h"
8419+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8420+
8421+#ifdef __cplusplus
8422+extern "C" {
8423+#endif
8424+
8425+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8426+#define BLOCK_NUM 4
8427+#define MS_SIMD_NEON
8428+
8429+static inline int64_t CdistTwoNormalOptNEON(int64_t index, const float *a, const float *b,
8430+                                                          float *out, int64_t size) {
8431+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
8432+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8433+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
8434+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
8435+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
8436+    tmp_vec = SIMD_ABS_F32(tmp_vec);
8437+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
8438+  }
8439+  *out += SIMD_GET_SUM_F32(result_vec);
8440+
8441+  return index;
8442+}
8443+
8444+static inline int64_t CdistPNormalOptNEON(int64_t index, const float *a, const float *b,
8445+                                                        float *out, int64_t size, float p) {
8446+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
8447+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
8448+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8449+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
8450+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
8451+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
8452+    tmp_vec = SIMD_ABS_F32(tmp_vec);
8453+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
8454+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
8455+  }
8456+  *out += SIMD_GET_SUM_F32(result_vec);
8457+
8458+  return index;
8459+}
8460+
8461+#undef MS_SIMD_INSTRUCTION
8462+#undef BLOCK_NUM
8463+
8464+#undef MS_SIMD_NEON
8465+#ifdef __cplusplus
8466+}
8467+#endif
8468+#endif
8469diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
8470new file mode 100644
8471index 00000000..d8a2580a
8472--- /dev/null
8473+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
8474@@ -0,0 +1,120 @@
8475+/**
8476+ * Copyright 2022 Huawei Technologies Co., Ltd
8477+ *
8478+ * Licensed under the Apache License, Version 2.0 (the "License");
8479+ * you may not use this file except in compliance with the License.
8480+ * You may obtain a copy of the License at
8481+ *
8482+ * http://www.apache.org/licenses/LICENSE-2.0
8483+ *
8484+ * Unless required by applicable law or agreed to in writing, software
8485+ * distributed under the License is distributed on an "AS IS" BASIS,
8486+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8487+ * See the License for the specific language governing permissions and
8488+ * limitations under the License.
8489+ */
8490+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_
8491+#define MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_
8492+
8493+#include "nnacl/intrinsics/ms_simd_instructions.h"
8494+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8495+
8496+#ifdef __cplusplus
8497+extern "C" {
8498+#endif
8499+
8500+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8501+#define BLOCK_NUM 4
8502+#define MS_SIMD_NEON
8503+
8504+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
8505+// (a, b, c) -> (0, a,   a+b)    exclusive == true
8506+static inline int64_t CumsumOutputInitWithInputNEON(int64_t index, const float *layer_input,
8507+  float *layer_output, int inner_dim) {
8508+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8509+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
8510+  }
8511+  return index;
8512+}
8513+
8514+static inline int64_t CumsumOutputInitWithZeroNEON(int64_t index, float *layer_output, int inner_dim) {
8515+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8516+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
8517+  }
8518+  return index;
8519+}
8520+
8521+static inline int64_t CumsumNEON(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
8522+  int inner_dim) {
8523+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8524+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
8525+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
8526+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
8527+    SIMD_ST_F32(layer_output + index, out_val);
8528+  }
8529+  return index;
8530+}
8531+
8532+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
8533+// (a, b, c) -> (c+b, c, 0) exclusive==true
8534+static inline int64_t CumsumReverseNEON(int64_t index, const float *layer_input, float *layer_output,
8535+  float *layer_last_output, int inner_dim) {
8536+
8537+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8538+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
8539+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
8540+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
8541+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
8542+  }
8543+  return index;
8544+}
8545+
8546+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
8547+// (a, b, c) -> (0, a,   a+b)    exclusive == true
8548+static inline int64_t CumsumIntOutputInitWithInputNEON(int64_t index, const int *layer_input,
8549+  int *layer_output, int inner_dim) {
8550+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8551+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
8552+  }
8553+  return index;
8554+}
8555+
8556+static inline int64_t CumsumIntOutputInitWithZeroNEON(int64_t index, int *layer_output, int inner_dim) {
8557+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8558+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
8559+  }
8560+  return index;
8561+}
8562+
8563+static inline int64_t CumsumIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
8564+  int inner_dim) {
8565+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8566+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
8567+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
8568+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
8569+    SIMD_ST_EPI32(layer_output + index, out_val);
8570+  }
8571+  return index;
8572+}
8573+
8574+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
8575+// (a, b, c) -> (c+b, c, 0) exclusive==true
8576+static inline int64_t CumsumReverseIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
8577+  int inner_dim) {
8578+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8579+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
8580+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
8581+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
8582+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
8583+  }
8584+  return index;
8585+}
8586+
8587+#undef MS_SIMD_INSTRUCTION
8588+#undef BLOCK_NUM
8589+
8590+#undef MS_SIMD_NEON
8591+#ifdef __cplusplus
8592+}
8593+#endif
8594+#endif
8595diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
8596new file mode 100644
8597index 00000000..c4ce6594
8598--- /dev/null
8599+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
8600@@ -0,0 +1,166 @@
8601+/**
8602+ * Copyright 2022 Huawei Technologies Co., Ltd
8603+ *
8604+ * Licensed under the Apache License, Version 2.0 (the "License");
8605+ * you may not use this file except in compliance with the License.
8606+ * You may obtain a copy of the License at
8607+ *
8608+ * http://www.apache.org/licenses/LICENSE-2.0
8609+ *
8610+ * Unless required by applicable law or agreed to in writing, software
8611+ * distributed under the License is distributed on an "AS IS" BASIS,
8612+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8613+ * See the License for the specific language governing permissions and
8614+ * limitations under the License.
8615+ */
8616+
8617+#ifndef MINDSPORE_NNACL_FP32_DIV_NEON_H_
8618+#define MINDSPORE_NNACL_FP32_DIV_NEON_H_
8619+
8620+#include "nnacl/intrinsics/ms_simd_instructions.h"
8621+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8622+
8623+#ifdef __cplusplus
8624+extern "C" {
8625+#endif
8626+
8627+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8628+#define BLOCK_NUM 4
8629+#define MS_SIMD_NEON
8630+
8631+static inline int ElementOptDivNum0NEON(int index, const float *in0, const float *in1, float *out,
8632+                                                      int size) {
8633+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
8634+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8635+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8636+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
8637+    SIMD_ST_F32(out + index, vout);
8638+  }
8639+  return index;
8640+}
8641+
8642+static inline int ElementOptDivNum1NEON(int index, const float *in0, const float *in1, float *out,
8643+                                                      int size) {
8644+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
8645+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8646+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8647+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
8648+    SIMD_ST_F32(out + index, vout);
8649+  }
8650+  return index;
8651+}
8652+
8653+static inline int ElementOptDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
8654+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
8655+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8656+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
8657+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
8658+    SIMD_ST_EPI32(out + index, vout);
8659+  }
8660+  return index;
8661+}
8662+
8663+static inline int ElementOptDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
8664+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
8665+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8666+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
8667+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
8668+    SIMD_ST_EPI32(out + index, vout);
8669+  }
8670+  return index;
8671+}
8672+
8673+static inline int ElementOptDivReluNum0NEON(int index, const float *in0, const float *in1, float *out,
8674+                                                          int size) {
8675+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
8676+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8677+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8678+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
8679+    SIMD_ST_F32(out + index, vout);
8680+  }
8681+  return index;
8682+}
8683+
8684+static inline int ElementOptDivReluNum1NEON(int index, const float *in0, const float *in1, float *out,
8685+                                                          int size) {
8686+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
8687+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8688+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8689+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
8690+    SIMD_ST_F32(out + index, vout);
8691+  }
8692+  return index;
8693+}
8694+
8695+static inline int ElementOptDivRelu6Num0NEON(int index, const float *in0, const float *in1, float *out,
8696+                                                           int size) {
8697+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
8698+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8699+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8700+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
8701+    SIMD_ST_F32(out + index, vout);
8702+  }
8703+  return index;
8704+}
8705+
8706+static inline int ElementOptDivRelu6Num1NEON(int index, const float *in0, const float *in1, float *out,
8707+                                                           int size) {
8708+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
8709+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8710+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8711+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
8712+    SIMD_ST_F32(out + index, vout);
8713+  }
8714+  return index;
8715+}
8716+
8717+static inline int ElementDivNEON(int index, const float *in0, const float *in1, float *out, int size) {
8718+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8719+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8720+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8721+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
8722+    SIMD_ST_F32(out + index, vout);
8723+  }
8724+  return index;
8725+}
8726+
8727+static inline int ElementDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
8728+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8729+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
8730+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
8731+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
8732+    SIMD_ST_EPI32(out + index, vout);
8733+  }
8734+  return index;
8735+}
8736+
8737+static inline int ElementDivReluNEON(int index, const float *in0, const float *in1, float *out,
8738+                                                   int size) {
8739+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8740+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8741+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8742+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
8743+    SIMD_ST_F32(out + index, vout);
8744+  }
8745+  return index;
8746+}
8747+
8748+static inline int ElementDivRelu6NEON(int index, const float *in0, const float *in1, float *out,
8749+                                                    int size) {
8750+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8751+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
8752+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
8753+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
8754+    SIMD_ST_F32(out + index, vout);
8755+  }
8756+  return index;
8757+}
8758+
8759+#undef MS_SIMD_INSTRUCTION
8760+#undef BLOCK_NUM
8761+
8762+#undef MS_SIMD_NEON
8763+#ifdef __cplusplus
8764+}
8765+#endif
8766+#endif
8767diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
8768new file mode 100644
8769index 00000000..b71db336
8770--- /dev/null
8771+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
8772@@ -0,0 +1,45 @@
8773+/**
8774+ * Copyright 2022 Huawei Technologies Co., Ltd
8775+ *
8776+ * Licensed under the Apache License, Version 2.0 (the "License");
8777+ * you may not use this file except in compliance with the License.
8778+ * You may obtain a copy of the License at
8779+ *
8780+ * http://www.apache.org/licenses/LICENSE-2.0
8781+ *
8782+ * Unless required by applicable law or agreed to in writing, software
8783+ * distributed under the License is distributed on an "AS IS" BASIS,
8784+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8785+ * See the License for the specific language governing permissions and
8786+ * limitations under the License.
8787+ */
8788+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_
8789+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_
8790+
8791+#include "nnacl/intrinsics/ms_simd_instructions.h"
8792+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8793+
8794+#ifdef __cplusplus
8795+extern "C" {
8796+#endif
8797+
8798+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8799+#define BLOCK_NUM 4
8800+#define MS_SIMD_NEON
8801+
8802+static inline int DropoutFp32NEON(int index, const float *input, float scale,
8803+    int length, float *output) {
8804+    SIMD_F32 scale_value = SIMD_MOV_F32(scale);
8805+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8806+        SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
8807+    }
8808+    return index;
8809+}
8810+#undef MS_SIMD_INSTRUCTION
8811+#undef BLOCK_NUM
8812+
8813+#undef MS_SIMD_NEON
8814+#ifdef __cplusplus
8815+}
8816+#endif
8817+#endif
8818diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
8819new file mode 100644
8820index 00000000..a594abd2
8821--- /dev/null
8822+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
8823@@ -0,0 +1,62 @@
8824+/**
8825+ * Copyright 2022 Huawei Technologies Co., Ltd
8826+ *
8827+ * Licensed under the Apache License, Version 2.0 (the "License");
8828+ * you may not use this file except in compliance with the License.
8829+ * You may obtain a copy of the License at
8830+ *
8831+ * http://www.apache.org/licenses/LICENSE-2.0
8832+ *
8833+ * Unless required by applicable law or agreed to in writing, software
8834+ * distributed under the License is distributed on an "AS IS" BASIS,
8835+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8836+ * See the License for the specific language governing permissions and
8837+ * limitations under the License.
8838+ */
8839+
8840+#ifndef MINDSPORE_NNACL_FP32_EXP_FP32_NEON_H_
8841+#define MINDSPORE_NNACL_FP32_EXP_FP32_NEON_H_
8842+
8843+#include "nnacl/intrinsics/ms_simd_instructions.h"
8844+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8845+
8846+#ifdef __cplusplus
8847+extern "C" {
8848+#endif
8849+
8850+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8851+#define BLOCK_NUM 4
8852+#define MS_SIMD_NEON
8853+
8854+static inline int64_t ExpFp32NEON(int64_t index, const float *src, float *dst, int num) {
8855+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8856+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
8857+  }
8858+  return index;
8859+}
8860+
8861+static inline int64_t ExpFp32WithInScaleNEON(int64_t index, const float *src, float *dst, int num, float in_scale) {
8862+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
8863+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8864+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
8865+  }
8866+  return index;
8867+}
8868+
8869+static inline int64_t ExpFp32WithOutScaleNEON(int64_t index, const float *src, float *dst, int num, float out_scale) {
8870+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
8871+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8872+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
8873+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
8874+  }
8875+  return index;
8876+}
8877+
8878+#undef MS_SIMD_INSTRUCTION
8879+#undef BLOCK_NUM
8880+
8881+#undef MS_SIMD_NEON
8882+#ifdef __cplusplus
8883+}
8884+#endif
8885+#endif
8886diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
8887new file mode 100644
8888index 00000000..c467d2d9
8889--- /dev/null
8890+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
8891@@ -0,0 +1,52 @@
8892+/**
8893+ * Copyright 2022 Huawei Technologies Co., Ltd
8894+ *
8895+ * Licensed under the Apache License, Version 2.0 (the "License");
8896+ * you may not use this file except in compliance with the License.
8897+ * You may obtain a copy of the License at
8898+ *
8899+ * http://www.apache.org/licenses/LICENSE-2.0
8900+ *
8901+ * Unless required by applicable law or agreed to in writing, software
8902+ * distributed under the License is distributed on an "AS IS" BASIS,
8903+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8904+ * See the License for the specific language governing permissions and
8905+ * limitations under the License.
8906+ */
8907+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_
8908+#define MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_
8909+
8910+#include "nnacl/intrinsics/ms_simd_instructions.h"
8911+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8912+
8913+#ifdef __cplusplus
8914+extern "C" {
8915+#endif
8916+
8917+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8918+#define BLOCK_NUM 4
8919+#define MS_SIMD_NEON
8920+
8921+static inline int FillFp32NEON(int index, float *output, int size, float data) {
8922+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8923+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
8924+  }
8925+  return index;
8926+}
8927+
8928+static inline int FillInt32NEON(int index, int *output, int size, int data) {
8929+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8930+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
8931+  }
8932+  return index;
8933+}
8934+
8935+#undef MS_SIMD_INSTRUCTION
8936+#undef BLOCK_NUM
8937+
8938+#undef MS_SIMD_NEON
8939+#ifdef __cplusplus
8940+}
8941+#endif
8942+#endif
8943+
8944diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
8945new file mode 100644
8946index 00000000..0eb6c9d2
8947--- /dev/null
8948+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
8949@@ -0,0 +1,76 @@
8950+/**
8951+ * Copyright 2022 Huawei Technologies Co., Ltd
8952+ *
8953+ * Licensed under the Apache License, Version 2.0 (the "License");
8954+ * you may not use this file except in compliance with the License.
8955+ * You may obtain a copy of the License at
8956+ *
8957+ * http://www.apache.org/licenses/LICENSE-2.0
8958+ *
8959+ * Unless required by applicable law or agreed to in writing, software
8960+ * distributed under the License is distributed on an "AS IS" BASIS,
8961+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8962+ * See the License for the specific language governing permissions and
8963+ * limitations under the License.
8964+ */
8965+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_
8966+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_
8967+
8968+#include "nnacl/intrinsics/ms_simd_instructions.h"
8969+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
8970+
8971+#ifdef __cplusplus
8972+extern "C" {
8973+#endif
8974+
8975+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
8976+#define BLOCK_NUM 4
8977+#define MS_SIMD_NEON
8978+
8979+static inline int64_t GroupNormFp32NEON(int64_t index, const float *unit_input, float scale, float offset, float mean,
8980+  float var_sqrt, int unit, float *unit_output) {
8981+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
8982+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
8983+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
8984+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
8985+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8986+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
8987+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
8988+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
8989+    SIMD_ST_F32(unit_output + index, output);
8990+  }
8991+  return index;
8992+}
8993+
8994+static inline int64_t GroupNormReduceSumNEON(int64_t index, const float *in, float *sum, int unit) {
8995+  if (unit - index >= 4 * BLOCK_NUM) {
8996+    SIMD_F32 tmp = SIMD_MOV_F32(0);
8997+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
8998+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
8999+    }
9000+    *sum += SIMD_GET_SUM_F32(tmp);
9001+  }
9002+  return index;
9003+}
9004+
9005+static inline int64_t GroupNormReduceVarNEON(int64_t index, const float *in, float mean, float *sum, int unit) {
9006+  if (unit - index >= 4 * BLOCK_NUM) {
9007+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
9008+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9009+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9010+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
9011+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
9012+    }
9013+    *sum += SIMD_GET_SUM_F32(tmp);
9014+  }
9015+  return index;
9016+}
9017+
9018+#undef MS_SIMD_INSTRUCTION
9019+#undef BLOCK_NUM
9020+
9021+#undef MS_SIMD_NEON
9022+#ifdef __cplusplus
9023+}
9024+#endif
9025+#endif
9026diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
9027new file mode 100644
9028index 00000000..0c528616
9029--- /dev/null
9030+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
9031@@ -0,0 +1,67 @@
9032+/**
9033+ * Copyright 2022 Huawei Technologies Co., Ltd
9034+ *
9035+ * Licensed under the Apache License, Version 2.0 (the "License");
9036+ * you may not use this file except in compliance with the License.
9037+ * You may obtain a copy of the License at
9038+ *
9039+ * http://www.apache.org/licenses/LICENSE-2.0
9040+ *
9041+ * Unless required by applicable law or agreed to in writing, software
9042+ * distributed under the License is distributed on an "AS IS" BASIS,
9043+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9044+ * See the License for the specific language governing permissions and
9045+ * limitations under the License.
9046+ */
9047+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_
9048+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_
9049+
9050+#include "nnacl/intrinsics/ms_simd_instructions.h"
9051+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9052+
9053+#ifdef __cplusplus
9054+extern "C" {
9055+#endif
9056+
9057+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9058+#define BLOCK_NUM 4
9059+#define MS_SIMD_NEON
9060+
9061+static inline int LayerNormMeanAndSquareNEON(int index, const float *src, int num, float *mean, float *square_mean) {
9062+  if (num >= 4 * BLOCK_NUM) {
9063+    SIMD_F32 sum_val = SIMD_SET0_F32;
9064+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
9065+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9066+      SIMD_F32 value = SIMD_LD_F32(src + index);
9067+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
9068+      sum_val = SIMD_ADD_F32(sum_val, value);
9069+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
9070+    }
9071+    *mean += SIMD_GET_SUM_F32(sum_val);
9072+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
9073+  }
9074+  return index;
9075+}
9076+
9077+static inline int LayerNormGammaAndBetaNEON(int index, float *dst, const float *src, const float *gamma_data,
9078+  const float *beta_data, int num, const float mean, const float deno) {
9079+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
9080+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
9081+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9082+    SIMD_F32 value = SIMD_LD_F32(src + index);
9083+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
9084+    out_value = SIMD_MUL_F32(out_value, deno_val);
9085+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
9086+    SIMD_ST_F32(dst + index, out_value);
9087+  }
9088+  return index;
9089+}
9090+
9091+#undef MS_SIMD_INSTRUCTION
9092+#undef BLOCK_NUM
9093+
9094+#undef MS_SIMD_NEON
9095+#ifdef __cplusplus
9096+}
9097+#endif
9098+#endif
9099diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
9100new file mode 100644
9101index 00000000..0e12e5a0
9102--- /dev/null
9103+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
9104@@ -0,0 +1,92 @@
9105+/**
9106+ * Copyright 2022 Huawei Technologies Co., Ltd
9107+ *
9108+ * Licensed under the Apache License, Version 2.0 (the "License");
9109+ * you may not use this file except in compliance with the License.
9110+ * You may obtain a copy of the License at
9111+ *
9112+ * http://www.apache.org/licenses/LICENSE-2.0
9113+ *
9114+ * Unless required by applicable law or agreed to in writing, software
9115+ * distributed under the License is distributed on an "AS IS" BASIS,
9116+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9117+ * See the License for the specific language governing permissions and
9118+ * limitations under the License.
9119+ */
9120+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_
9121+#define MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_
9122+
9123+#include "nnacl/intrinsics/ms_simd_instructions.h"
9124+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9125+
9126+#ifdef __cplusplus
9127+extern "C" {
9128+#endif
9129+
9130+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9131+#define BLOCK_NUM 4
9132+#define MS_SIMD_NEON
9133+
9134+// act_type must be 0, 1, 2. 0: no_act, 1: relu, 3: relu6.
9135+static inline int64_t GemmIsNotPackNEON(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
9136+  int deep, int act_type) {
9137+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
9138+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
9139+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
9140+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);
9141+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9142+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
9143+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
9144+    if (act_type != 0) {
9145+      dst = SIMD_MAX_F32(dst, down_threshold);
9146+      if (act_type == 3) {
9147+        dst = SIMD_MIN_F32(dst, up_threshold);
9148+      }
9149+    }
9150+    SIMD_ST_F32(c + index, dst);
9151+  }
9152+
9153+  return index;
9154+}
9155+
9156+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)
9157+static inline int64_t GemmIsNotPackOptimizeCoreNEON(int64_t index, const float *a, const float *b, int k, float *dst) {
9158+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
9159+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9160+    SIMD_F32 weight = SIMD_LD_F32(b + index);
9161+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
9162+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
9163+  }
9164+  *dst += SIMD_REDUCE_ADD_F32(dst1);
9165+  return index;
9166+}
9167+#endif
9168+
9169+static inline int64_t MatVecMulNoPackCoreNEON(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
9170+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
9171+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
9172+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
9173+    for (int64_t k = 0; k < depth; ++k) {
9174+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
9175+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
9176+      out = SIMD_FMADD_F32(left, right, out);
9177+    }
9178+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
9179+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
9180+      if (act_type == 0x3) {
9181+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
9182+      }
9183+    }
9184+    SIMD_ST_F32(c + oc_index, out);
9185+  }
9186+  return oc_index;
9187+}
9188+
9189+#undef MS_SIMD_INSTRUCTION
9190+#undef BLOCK_NUM
9191+
9192+#undef MS_SIMD_NEON
9193+#ifdef __cplusplus
9194+}
9195+#endif
9196+#endif
9197diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
9198new file mode 100644
9199index 00000000..33506e0c
9200--- /dev/null
9201+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
9202@@ -0,0 +1,217 @@
9203+/**
9204+ * Copyright 2022 Huawei Technologies Co., Ltd
9205+ *
9206+ * Licensed under the Apache License, Version 2.0 (the "License");
9207+ * you may not use this file except in compliance with the License.
9208+ * You may obtain a copy of the License at
9209+ *
9210+ * http://www.apache.org/licenses/LICENSE-2.0
9211+ *
9212+ * Unless required by applicable law or agreed to in writing, software
9213+ * distributed under the License is distributed on an "AS IS" BASIS,
9214+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9215+ * See the License for the specific language governing permissions and
9216+ * limitations under the License.
9217+ */
9218+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
9219+#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_
9220+
9221+#include "nnacl/intrinsics/ms_simd_instructions.h"
9222+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9223+
9224+#ifdef __cplusplus
9225+extern "C" {
9226+#endif
9227+
9228+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9229+#define BLOCK_NUM 4
9230+#define MS_SIMD_NEON
9231+
9232+static inline int ElementMulNEON(int index, const float *in0, const float *in1, float *out, int size) {
9233+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9234+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9235+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9236+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
9237+    SIMD_ST_F32(out + index, vout);
9238+  }
9239+  return index;
9240+}
9241+
9242+static inline int ElementMulReluNEON(int index, const float *in0, const float *in1, float *out, int size) {
9243+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9244+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9245+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9246+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
9247+    SIMD_ST_F32(out + index, vout);
9248+  }
9249+  return index;
9250+}
9251+
9252+static inline int ElementMulRelu6NEON(int index, const float *in0, const float *in1, float *out, int size) {
9253+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9254+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9255+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9256+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
9257+    SIMD_ST_F32(out + index, vout);
9258+  }
9259+  return index;
9260+}
9261+
9262+static inline int ElementMulIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
9263+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9264+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9265+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9266+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
9267+    SIMD_ST_EPI32(out + index, vout);
9268+  }
9269+  return index;
9270+}
9271+
9272+static inline int ElementMulReluIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
9273+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9274+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9275+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9276+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
9277+    SIMD_ST_EPI32(out + index, vout);
9278+  }
9279+  return index;
9280+}
9281+
9282+static inline int ElementMulRelu6IntNEON(int index, const int *in0, const int *in1, int *out, int size) {
9283+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9284+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9285+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9286+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
9287+    SIMD_ST_EPI32(out + index, vout);
9288+  }
9289+  return index;
9290+}
9291+
9292+static inline int ElementOptMulNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
9293+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
9294+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9295+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9296+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
9297+    SIMD_ST_F32(out + index, vout);
9298+  }
9299+  return index;
9300+}
9301+
9302+static inline int ElementOptMulNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
9303+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9304+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9305+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9306+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
9307+    SIMD_ST_F32(out + index, vout);
9308+  }
9309+  return index;
9310+}
9311+
9312+static inline int ElementOptMulReluNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
9313+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
9314+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9315+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9316+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
9317+    SIMD_ST_F32(out + index, vout);
9318+  }
9319+  return index;
9320+}
9321+
9322+static inline int ElementOptMulReluNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
9323+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9324+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9325+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9326+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
9327+    SIMD_ST_F32(out + index, vout);
9328+  }
9329+  return index;
9330+}
9331+
9332+static inline int ElementOptMulRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, int size) {
9333+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
9334+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9335+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9336+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
9337+    SIMD_ST_F32(out + index, vout);
9338+  }
9339+  return index;
9340+}
9341+
9342+static inline int ElementOptMulRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, int size) {
9343+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9344+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9345+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9346+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
9347+    SIMD_ST_F32(out + index, vout);
9348+  }
9349+  return index;
9350+}
9351+
9352+static inline int ElementOptMulIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9353+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
9354+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9355+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9356+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
9357+    SIMD_ST_EPI32(out + index, vout);
9358+  }
9359+  return index;
9360+}
9361+
9362+static inline int ElementOptMulIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9363+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9364+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9365+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9366+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
9367+    SIMD_ST_EPI32(out + index, vout);
9368+  }
9369+  return index;
9370+}
9371+
9372+static inline int ElementOptMulReluIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9373+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
9374+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9375+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9376+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
9377+    SIMD_ST_EPI32(out + index, vout);
9378+  }
9379+  return index;
9380+}
9381+
9382+static inline int ElementOptMulReluIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9383+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9384+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9385+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9386+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
9387+    SIMD_ST_EPI32(out + index, vout);
9388+  }
9389+  return index;
9390+}
9391+
9392+static inline int ElementOptMulRelu6IntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9393+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
9394+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9395+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9396+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
9397+    SIMD_ST_EPI32(out + index, vout);
9398+  }
9399+  return index;
9400+}
9401+
9402+static inline int ElementOptMulRelu6IntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9403+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9404+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9405+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9406+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
9407+    SIMD_ST_EPI32(out + index, vout);
9408+  }
9409+  return index;
9410+}
9411+
9412+#undef MS_SIMD_INSTRUCTION
9413+#undef BLOCK_NUM
9414+
9415+#undef MS_SIMD_NEON
9416+#ifdef __cplusplus
9417+}
9418+#endif
9419+#endif
9420diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
9421new file mode 100644
9422index 00000000..ea6acf62
9423--- /dev/null
9424+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
9425@@ -0,0 +1,83 @@
9426+/**
9427+ * Copyright 2022 Huawei Technologies Co., Ltd
9428+ *
9429+ * Licensed under the Apache License, Version 2.0 (the "License");
9430+ * you may not use this file except in compliance with the License.
9431+ * You may obtain a copy of the License at
9432+ *
9433+ * http://www.apache.org/licenses/LICENSE-2.0
9434+ *
9435+ * Unless required by applicable law or agreed to in writing, software
9436+ * distributed under the License is distributed on an "AS IS" BASIS,
9437+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9438+ * See the License for the specific language governing permissions and
9439+ * limitations under the License.
9440+ */
9441+#ifndef MINDSPORE_NNACL_FP32_POOLING_NEON_H_
9442+#define MINDSPORE_NNACL_FP32_POOLING_NEON_H_
9443+
9444+#include "nnacl/intrinsics/ms_simd_instructions.h"
9445+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9446+
9447+#ifdef __cplusplus
9448+extern "C" {
9449+#endif
9450+
9451+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9452+#define BLOCK_NUM 4
9453+#define MS_SIMD_NEON
9454+
9455+static inline int AvgPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel,
9456+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
9457+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
9458+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
9459+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
9460+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
9461+    const float *src_c_ptr = src_plane_ptr + ci;
9462+    float *dst_c_ptr = dst_plane_ptr + ci;
9463+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
9464+    int real_count = 0;
9465+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
9466+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
9467+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
9468+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
9469+        ++real_count;
9470+      }
9471+    }
9472+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
9473+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
9474+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
9475+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
9476+  }
9477+  return ci;
9478+}
9479+
9480+static inline int MaxPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel,
9481+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
9482+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
9483+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
9484+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
9485+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
9486+    const float *src_c_ptr = src_plane_ptr + ci;
9487+    float *dst_c_ptr = dst_plane_ptr + ci;
9488+    SIMD_F32 tmp_max = min_val;
9489+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
9490+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
9491+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
9492+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
9493+      }
9494+    }
9495+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
9496+    SIMD_ST_F32(dst_c_ptr, tmp_max);
9497+  }
9498+  return ci;
9499+}
9500+
9501+#undef MS_SIMD_INSTRUCTION
9502+#undef BLOCK_NUM
9503+
9504+#undef MS_SIMD_NEON
9505+#ifdef __cplusplus
9506+}
9507+#endif
9508+#endif
9509diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
9510new file mode 100644
9511index 00000000..fd8699c7
9512--- /dev/null
9513+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
9514@@ -0,0 +1,100 @@
9515+/**
9516+ * Copyright 2022 Huawei Technologies Co., Ltd
9517+ *
9518+ * Licensed under the Apache License, Version 2.0 (the "License");
9519+ * you may not use this file except in compliance with the License.
9520+ * You may obtain a copy of the License at
9521+ *
9522+ * http://www.apache.org/licenses/LICENSE-2.0
9523+ *
9524+ * Unless required by applicable law or agreed to in writing, software
9525+ * distributed under the License is distributed on an "AS IS" BASIS,
9526+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9527+ * See the License for the specific language governing permissions and
9528+ * limitations under the License.
9529+ */
9530+#ifndef MINDSPORE_NNACL_FP32_POWER_NEON_H_
9531+#define MINDSPORE_NNACL_FP32_POWER_NEON_H_
9532+
9533+#include "nnacl/intrinsics/ms_simd_instructions.h"
9534+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9535+
9536+#ifdef __cplusplus
9537+extern "C" {
9538+#endif
9539+
9540+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9541+#define BLOCK_NUM 4
9542+#define MS_SIMD_NEON
9543+
9544+static inline int PowerBroadCastIntExponentNEON(int index, const float *input, int exponent, float *output, int len,
9545+  float scale, float shift) {
9546+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
9547+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
9548+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9549+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
9550+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
9551+    int exp = abs(exponent);
9552+    while (exp) {
9553+      if (exp % 2) {
9554+        result = SIMD_MUL_F32(result, tmp);
9555+      }
9556+      tmp = SIMD_MUL_SQUARE_F32(tmp);
9557+      exp = exp / 2;
9558+    }
9559+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
9560+  }
9561+  return index;
9562+}
9563+
9564+static inline int PowerBroadCastFloatExponentNEON(int index, const float *input, float exponent, float *output, int len,
9565+  float scale, float shift) {
9566+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
9567+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
9568+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9569+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
9570+    SIMD_F32 result;
9571+    for (int i = 0; i < BLOCK_NUM; ++i) {
9572+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
9573+    }
9574+    SIMD_ST_F32(output + index, result);
9575+  }
9576+  return index;
9577+}
9578+
9579+static inline int PowerSingleExponentNEON(int index, const float *input, const float *exponent, float *output, int len,
9580+  float scale, float shift) {
9581+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
9582+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
9583+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9584+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
9585+    for (int j = 0; j < BLOCK_NUM; ++j) {
9586+      float cur_exponent = exponent[index + j];
9587+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
9588+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
9589+        int exp = abs((int)(cur_exponent));
9590+        float result = 1;
9591+        while (exp) {
9592+          if (exp % 2) {
9593+            result *= cur_val;
9594+          }
9595+          cur_val *= cur_val;
9596+          exp = exp / 2;
9597+        }
9598+        output[index + j] = *exponent >= 0 ? result : 1 / result;
9599+      } else {
9600+        output[index + j] = powf(cur_val, cur_exponent);
9601+      }
9602+    }
9603+  }
9604+  return index;
9605+}
9606+
9607+#undef MS_SIMD_INSTRUCTION
9608+#undef BLOCK_NUM
9609+
9610+#undef MS_SIMD_NEON
9611+#ifdef __cplusplus
9612+}
9613+#endif
9614+#endif
9615diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
9616new file mode 100644
9617index 00000000..7f9153f8
9618--- /dev/null
9619+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
9620@@ -0,0 +1,180 @@
9621+/**
9622+ * Copyright 2022 Huawei Technologies Co., Ltd
9623+ *
9624+ * Licensed under the Apache License, Version 2.0 (the "License");
9625+ * you may not use this file except in compliance with the License.
9626+ * You may obtain a copy of the License at
9627+ *
9628+ * http://www.apache.org/licenses/LICENSE-2.0
9629+ *
9630+ * Unless required by applicable law or agreed to in writing, software
9631+ * distributed under the License is distributed on an "AS IS" BASIS,
9632+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9633+ * See the License for the specific language governing permissions and
9634+ * limitations under the License.
9635+ */
9636+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_
9637+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_
9638+
9639+#include "nnacl/intrinsics/ms_simd_instructions.h"
9640+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9641+
9642+#ifdef __cplusplus
9643+extern "C" {
9644+#endif
9645+
9646+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9647+#define BLOCK_NUM 4
9648+#define MS_SIMD_NEON
9649+
9650+static inline int64_t ReduceSumNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9651+  int axis_size) {
9652+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9653+    const float *inner_src = outer_src + index;
9654+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9655+    for (int i = 0; i < axis_size; i++) {
9656+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9657+    }
9658+    SIMD_ST_F32(outer_dst + index, tmp);
9659+  }
9660+  return index;
9661+}
9662+
9663+static inline int64_t ReduceMeanNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9664+  int axis_size) {
9665+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9666+    const float *inner_src = outer_src + index;
9667+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9668+    for (int i = 0; i < axis_size; i++) {
9669+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9670+    }
9671+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
9672+  }
9673+  return index;
9674+}
9675+
9676+static inline int64_t ReduceMinNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9677+  int axis_size) {
9678+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9679+    const float *inner_src = outer_src + index;
9680+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
9681+    for (int i = 0; i < axis_size; i++) {
9682+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9683+    }
9684+    SIMD_ST_F32(outer_dst + index, tmp);
9685+  }
9686+  return index;
9687+}
9688+
9689+static inline int64_t ReduceMaxNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9690+  int axis_size) {
9691+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9692+    const float *inner_src = outer_src + index;
9693+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MIN);
9694+    for (int i = 0; i < axis_size; i++) {
9695+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9696+    }
9697+    SIMD_ST_F32(outer_dst + index, tmp);
9698+  }
9699+  return index;
9700+}
9701+
9702+static inline int64_t ReduceProdNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9703+  int axis_size) {
9704+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9705+    const float *inner_src = outer_src + index;
9706+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
9707+    for (int i = 0; i < axis_size; i++) {
9708+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
9709+    }
9710+    SIMD_ST_F32(outer_dst + index, tmp);
9711+  }
9712+  return index;
9713+}
9714+
9715+static inline int64_t ReduceSumSquareNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9716+  int axis_size) {
9717+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9718+    const float *inner_src = outer_src + index;
9719+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9720+    for (int i = 0; i < axis_size; i++) {
9721+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
9722+    }
9723+    SIMD_ST_F32(outer_dst + index, tmp);
9724+  }
9725+  return index;
9726+}
9727+
9728+static inline int64_t ReduceL2NormNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
9729+  int axis_size) {
9730+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9731+    const float *inner_src = outer_src + index;
9732+    SIMD_F32 tmp = SIMD_MOV_F32(0);
9733+    for (int i = 0; i < axis_size; i++) {
9734+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
9735+    }
9736+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));
9737+  }
9738+  return index;
9739+}
9740+
9741+static inline int64_t IntReduceSumNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9742+  int axis_size) {
9743+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9744+    const int *inner_src = outer_src + index;
9745+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
9746+    for (int i = 0; i < axis_size; i++) {
9747+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9748+    }
9749+    SIMD_ST_EPI32(outer_dst + index, tmp);
9750+  }
9751+  return index;
9752+}
9753+
9754+static inline int64_t IntReduceMeanNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9755+  int axis_size) {
9756+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9757+    const int *inner_src = outer_src + index;
9758+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
9759+    for (int i = 0; i < axis_size; i++) {
9760+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9761+    }
9762+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));
9763+  }
9764+  return index;
9765+}
9766+
9767+static inline int64_t IntReduceMinNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9768+  int axis_size) {
9769+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9770+    const int *inner_src = outer_src + index;
9771+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);
9772+    for (int i = 0; i < axis_size; i++) {
9773+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9774+    }
9775+    SIMD_ST_EPI32(outer_dst + index, tmp);
9776+  }
9777+  return index;
9778+}
9779+
9780+static inline int64_t IntReduceMaxNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
9781+  int axis_size) {
9782+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9783+    const int *inner_src = outer_src + index;
9784+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);
9785+    for (int i = 0; i < axis_size; i++) {
9786+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
9787+    }
9788+    SIMD_ST_EPI32(outer_dst + index, tmp);
9789+  }
9790+  return index;
9791+}
9792+
9793+#undef MS_SIMD_INSTRUCTION
9794+#undef BLOCK_NUM
9795+
9796+#undef MS_SIMD_NEON
9797+#ifdef __cplusplus
9798+}
9799+#endif
9800+#endif
9801diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
9802new file mode 100644
9803index 00000000..f116d92f
9804--- /dev/null
9805+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
9806@@ -0,0 +1,86 @@
9807+/**
9808+ * Copyright 2022 Huawei Technologies Co., Ltd
9809+ *
9810+ * Licensed under the Apache License, Version 2.0 (the "License");
9811+ * you may not use this file except in compliance with the License.
9812+ * You may obtain a copy of the License at
9813+ *
9814+ * http://www.apache.org/licenses/LICENSE-2.0
9815+ *
9816+ * Unless required by applicable law or agreed to in writing, software
9817+ * distributed under the License is distributed on an "AS IS" BASIS,
9818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9819+ * See the License for the specific language governing permissions and
9820+ * limitations under the License.
9821+ */
9822+
9823+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_
9824+#define MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_
9825+
9826+#include "nnacl/intrinsics/ms_simd_instructions.h"
9827+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9828+
9829+#ifdef __cplusplus
9830+extern "C" {
9831+#endif
9832+
9833+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9834+#define BLOCK_NUM 4
9835+#define MS_SIMD_NEON
9836+
9837+static inline int64_t SoftmaxNormGetMaxNEON(int64_t index, const float *src, int cur_batch_offset,
9838+  float *max, int channel) {
9839+  if (channel >= BLOCK_NUM * BLOCK_NUM) {
9840+    SIMD_F32 max_val = SIMD_MOV_F32(*max);
9841+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9842+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
9843+    }
9844+    *max = SIMD_GET_MAX_F32(max_val);
9845+  }
9846+  return index;
9847+}
9848+
9849+static inline int64_t SoftmaxNormCalcNormNEON(int64_t index, const float *src, float *dst,
9850+  int cur_batch_offset, float max, int channel) {
9851+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9852+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
9853+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
9854+  }
9855+  return index;
9856+}
9857+
9858+static inline int64_t SoftmaxLastAxisGetExpSumNEON(int64_t index, const float *src, float *dst,
9859+  int cur_batch_offset, float max, float *exp_sum, int channel) {
9860+#ifndef _WIN32
9861+  SIMD_F32 sum_val = SIMD_SET0_F32;
9862+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9863+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
9864+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
9865+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
9866+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
9867+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
9868+  }
9869+  *exp_sum += SIMD_GET_SUM_F32(sum_val);
9870+#endif
9871+  return index;
9872+}
9873+
9874+static inline int64_t SoftmaxLastAxisGetResultNEON(int64_t index, const float *src, float *dst,
9875+  int cur_batch_offset, float exp_sum, int channel) {
9876+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
9877+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9878+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
9879+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
9880+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
9881+  }
9882+  return index;
9883+}
9884+
9885+#undef MS_SIMD_INSTRUCTION
9886+#undef BLOCK_NUM
9887+
9888+#undef MS_SIMD_NEON
9889+#ifdef __cplusplus
9890+};
9891+#endif
9892+#endif
9893diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
9894new file mode 100644
9895index 00000000..d2731101
9896--- /dev/null
9897+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
9898@@ -0,0 +1,166 @@
9899+/**
9900+ * Copyright 2022 Huawei Technologies Co., Ltd
9901+ *
9902+ * Licensed under the Apache License, Version 2.0 (the "License");
9903+ * you may not use this file except in compliance with the License.
9904+ * You may obtain a copy of the License at
9905+ *
9906+ * http://www.apache.org/licenses/LICENSE-2.0
9907+ *
9908+ * Unless required by applicable law or agreed to in writing, software
9909+ * distributed under the License is distributed on an "AS IS" BASIS,
9910+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9911+ * See the License for the specific language governing permissions and
9912+ * limitations under the License.
9913+ */
9914+
9915+#ifndef MINDSPORE_NNACL_FP32_SUB_NEON_H_
9916+#define MINDSPORE_NNACL_FP32_SUB_NEON_H_
9917+
9918+#include "nnacl/intrinsics/ms_simd_instructions.h"
9919+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
9920+
9921+#ifdef __cplusplus
9922+extern "C" {
9923+#endif
9924+
9925+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
9926+#define BLOCK_NUM 4
9927+#define MS_SIMD_NEON
9928+
9929+static inline int ElementOptSubNum0NEON(int index, const float *in0, const float *in1, float *out,
9930+                                                      int size) {
9931+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
9932+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9933+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9934+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
9935+    SIMD_ST_F32(out + index, vout);
9936+  }
9937+  return index;
9938+}
9939+
9940+static inline int ElementOptSubNum1NEON(int index, const float *in0, const float *in1, float *out,
9941+                                                      int size) {
9942+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9943+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9944+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9945+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
9946+    SIMD_ST_F32(out + index, vout);
9947+  }
9948+  return index;
9949+}
9950+
9951+static inline int ElementOptSubIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) {
9952+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
9953+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9954+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
9955+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
9956+    SIMD_ST_EPI32(out + index, vout);
9957+  }
9958+  return index;
9959+}
9960+
9961+static inline int ElementOptSubIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) {
9962+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
9963+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9964+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
9965+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
9966+    SIMD_ST_EPI32(out + index, vout);
9967+  }
9968+  return index;
9969+}
9970+
9971+static inline int ElementOptSubReluNum0NEON(int index, const float *in0, const float *in1, float *out,
9972+                                                          int size) {
9973+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
9974+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9975+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9976+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
9977+    SIMD_ST_F32(out + index, vout);
9978+  }
9979+  return index;
9980+}
9981+
9982+static inline int ElementOptSubReluNum1NEON(int index, const float *in0, const float *in1, float *out,
9983+                                                          int size) {
9984+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
9985+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9986+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
9987+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
9988+    SIMD_ST_F32(out + index, vout);
9989+  }
9990+  return index;
9991+}
9992+
9993+static inline int ElementOptSubRelu6Num0NEON(int index, const float *in0, const float *in1, float *out,
9994+                                                           int size) {
9995+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
9996+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
9997+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
9998+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
9999+    SIMD_ST_F32(out + index, vout);
10000+  }
10001+  return index;
10002+}
10003+
10004+static inline int ElementOptSubRelu6Num1NEON(int index, const float *in0, const float *in1, float *out,
10005+                                                           int size) {
10006+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
10007+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10008+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10009+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
10010+    SIMD_ST_F32(out + index, vout);
10011+  }
10012+  return index;
10013+}
10014+
10015+static inline int ElementSubNEON(int index, const float *in0, const float *in1, float *out, int size) {
10016+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10017+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10018+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10019+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
10020+    SIMD_ST_F32(out + index, vout);
10021+  }
10022+  return index;
10023+}
10024+
10025+static inline int ElementSubIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
10026+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10027+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
10028+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
10029+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
10030+    SIMD_ST_EPI32(out + index, vout);
10031+  }
10032+  return index;
10033+}
10034+
10035+static inline int ElementSubReluNEON(int index, const float *in0, const float *in1, float *out,
10036+                                                   int size) {
10037+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10038+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10039+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10040+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
10041+    SIMD_ST_F32(out + index, vout);
10042+  }
10043+  return index;
10044+}
10045+
10046+static inline int ElementSubRelu6NEON(int index, const float *in0, const float *in1, float *out,
10047+                                                    int size) {
10048+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10049+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10050+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10051+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
10052+    SIMD_ST_F32(out + index, vout);
10053+  }
10054+  return index;
10055+}
10056+
10057+#undef MS_SIMD_INSTRUCTION
10058+#undef BLOCK_NUM
10059+
10060+#undef MS_SIMD_NEON
10061+#ifdef __cplusplus
10062+};
10063+#endif
10064+#endif
10065diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
10066new file mode 100644
10067index 00000000..75bda800
10068--- /dev/null
10069+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
10070@@ -0,0 +1,36 @@
10071+/**
10072+ * Copyright 2022 Huawei Technologies Co., Ltd
10073+ *
10074+ * Licensed under the Apache License, Version 2.0 (the "License");
10075+ * you may not use this file except in compliance with the License.
10076+ * You may obtain a copy of the License at
10077+ *
10078+ * http://www.apache.org/licenses/LICENSE-2.0
10079+ *
10080+ * Unless required by applicable law or agreed to in writing, software
10081+ * distributed under the License is distributed on an "AS IS" BASIS,
10082+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10083+ * See the License for the specific language governing permissions and
10084+ * limitations under the License.
10085+ */
10086+#ifndef MINDSPORE_NNACL_POOLING_FP32_SIMD_H_
10087+#define MINDSPORE_NNACL_POOLING_FP32_SIMD_H_
10088+
10089+#include "nnacl/intrinsics/ms_simd_instructions.h"
10090+#ifdef ENABLE_AVX512
10091+#include "nnacl/avx512/pooling_fp32_avx512.h"
10092+#endif
10093+
10094+#ifdef ENABLE_AVX
10095+#include "nnacl/avx/pooling_fp32_avx.h"
10096+#endif
10097+
10098+#ifdef ENABLE_SSE
10099+#include "nnacl/sse/pooling_fp32_sse.h"
10100+#endif
10101+
10102+#ifdef ENABLE_ARM
10103+#include "nnacl/neon/pooling_fp32_neon.h"
10104+#endif
10105+
10106+#endif
10107diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
10108new file mode 100644
10109index 00000000..15e9f009
10110--- /dev/null
10111+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
10112@@ -0,0 +1,36 @@
10113+/**
10114+ * Copyright 2022 Huawei Technologies Co., Ltd
10115+ *
10116+ * Licensed under the Apache License, Version 2.0 (the "License");
10117+ * you may not use this file except in compliance with the License.
10118+ * You may obtain a copy of the License at
10119+ *
10120+ * http://www.apache.org/licenses/LICENSE-2.0
10121+ *
10122+ * Unless required by applicable law or agreed to in writing, software
10123+ * distributed under the License is distributed on an "AS IS" BASIS,
10124+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10125+ * See the License for the specific language governing permissions and
10126+ * limitations under the License.
10127+ */
10128+#ifndef MINDSPORE_NNACL_POWER_FP32_SIMD_H_
10129+#define MINDSPORE_NNACL_POWER_FP32_SIMD_H_
10130+
10131+#include "nnacl/intrinsics/ms_simd_instructions.h"
10132+#ifdef ENABLE_AVX512
10133+#include "nnacl/avx512/power_fp32_avx512.h"
10134+#endif
10135+
10136+#ifdef ENABLE_AVX
10137+#include "nnacl/avx/power_fp32_avx.h"
10138+#endif
10139+
10140+#ifdef ENABLE_SSE
10141+#include "nnacl/sse/power_fp32_sse.h"
10142+#endif
10143+
10144+#ifdef ENABLE_ARM
10145+#include "nnacl/neon/power_fp32_neon.h"
10146+#endif
10147+
10148+#endif
10149diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
10150new file mode 100644
10151index 00000000..60d0cd85
10152--- /dev/null
10153+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
10154@@ -0,0 +1,36 @@
10155+/**
10156+ * Copyright 2022 Huawei Technologies Co., Ltd
10157+ *
10158+ * Licensed under the Apache License, Version 2.0 (the "License");
10159+ * you may not use this file except in compliance with the License.
10160+ * You may obtain a copy of the License at
10161+ *
10162+ * http://www.apache.org/licenses/LICENSE-2.0
10163+ *
10164+ * Unless required by applicable law or agreed to in writing, software
10165+ * distributed under the License is distributed on an "AS IS" BASIS,
10166+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10167+ * See the License for the specific language governing permissions and
10168+ * limitations under the License.
10169+ */
10170+#ifndef MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_
10171+#define MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_
10172+
10173+#include "nnacl/intrinsics/ms_simd_instructions.h"
10174+#ifdef ENABLE_AVX512
10175+#include "nnacl/avx512/reduce_fp32_avx512.h"
10176+#endif
10177+
10178+#ifdef ENABLE_AVX
10179+#include "nnacl/avx/reduce_fp32_avx.h"
10180+#endif
10181+
10182+#ifdef ENABLE_SSE
10183+#include "nnacl/sse/reduce_fp32_sse.h"
10184+#endif
10185+
10186+#ifdef ENABLE_ARM
10187+#include "nnacl/neon/reduce_fp32_neon.h"
10188+#endif
10189+
10190+#endif
10191diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
10192new file mode 100644
10193index 00000000..524668ab
10194--- /dev/null
10195+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
10196@@ -0,0 +1,36 @@
10197+/**
10198+ * Copyright 2022 Huawei Technologies Co., Ltd
10199+ *
10200+ * Licensed under the Apache License, Version 2.0 (the "License");
10201+ * you may not use this file except in compliance with the License.
10202+ * You may obtain a copy of the License at
10203+ *
10204+ * http://www.apache.org/licenses/LICENSE-2.0
10205+ *
10206+ * Unless required by applicable law or agreed to in writing, software
10207+ * distributed under the License is distributed on an "AS IS" BASIS,
10208+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10209+ * See the License for the specific language governing permissions and
10210+ * limitations under the License.
10211+ */
10212+#ifndef MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_
10213+#define MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_
10214+
10215+#include "nnacl/intrinsics/ms_simd_instructions.h"
10216+#ifdef ENABLE_AVX512
10217+#include "nnacl/avx512/softmax_fp32_avx512.h"
10218+#endif
10219+
10220+#ifdef ENABLE_AVX
10221+#include "nnacl/avx/softmax_fp32_avx.h"
10222+#endif
10223+
10224+#ifdef ENABLE_SSE
10225+#include "nnacl/sse/softmax_fp32_sse.h"
10226+#endif
10227+
10228+#ifdef ENABLE_ARM
10229+#include "nnacl/neon/softmax_fp32_neon.h"
10230+#endif
10231+
10232+#endif
10233diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
10234new file mode 100644
10235index 00000000..192fc66d
10236--- /dev/null
10237+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
10238@@ -0,0 +1,221 @@
10239+/**
10240+ * Copyright 2022 Huawei Technologies Co., Ltd
10241+ *
10242+ * Licensed under the Apache License, Version 2.0 (the "License");
10243+ * you may not use this file except in compliance with the License.
10244+ * You may obtain a copy of the License at
10245+ *
10246+ * http://www.apache.org/licenses/LICENSE-2.0
10247+ *
10248+ * Unless required by applicable law or agreed to in writing, software
10249+ * distributed under the License is distributed on an "AS IS" BASIS,
10250+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10251+ * See the License for the specific language governing permissions and
10252+ * limitations under the License.
10253+ */
10254+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_
10255+#define MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_
10256+
10257+#include "nnacl/intrinsics/ms_simd_instructions.h"
10258+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10259+
10260+#ifdef __cplusplus
10261+extern "C" {
10262+#endif
10263+#pragma GCC push_options
10264+#pragma GCC target("sse4.1")
10265+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10266+#define BLOCK_NUM 4
10267+#define MS_SIMD_SSE
10268+
10269+static inline int Fp32ReluSSE(int index, const float *src, int length, float *dst) {
10270+    SIMD_F32 zero = SIMD_SET0_F32;
10271+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10272+        SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
10273+    }
10274+    return index;
10275+}
10276+
10277+static inline int Int32ReluSSE(int index, const int32_t *src, int length, int32_t *dst) {
10278+    SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f);
10279+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10280+        SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
10281+    }
10282+    return index;
10283+}
10284+
10285+static inline int Fp32Relu6SSE(int index, const float *src, int length, float *dst) {
10286+    SIMD_F32 zero = SIMD_SET0_F32;
10287+    SIMD_F32 six = SIMD_MOV_F32(6.0f);
10288+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10289+        SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
10290+    }
10291+    return index;
10292+}
10293+
10294+static inline int LReluSSE(int index, const float *src, int length, float *dst, float alpha) {
10295+    SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
10296+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10297+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10298+        SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
10299+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
10300+    }
10301+    return index;
10302+}
10303+
10304+static inline int SigmoidSSE(int index, const float *src, int length, float *dst) {
10305+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10306+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
10307+        SIMD_ST_F32(dst + index,
10308+                    SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
10309+    }
10310+    return index;
10311+}
10312+
10313+static inline int TanhSSE(int index, const float *src, int length, float *dst) {
10314+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10315+        SIMD_F32 input = SIMD_LD_F32(src + index);
10316+        SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
10317+    }
10318+    return index;
10319+}
10320+
10321+static inline int SwishSSE(int index, const float *src, int length, float *dst) {
10322+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10323+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
10324+        SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
10325+        SIMD_ST_F32(dst + index,
10326+                    SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
10327+    }
10328+    return index;
10329+}
10330+
10331+static inline int HSwishSSE(int index, const float *src, int length, float *dst) {
10332+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10333+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
10334+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
10335+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
10336+    }
10337+    return index;
10338+}
10339+
10340+static inline int HSigmoidSSE(int index, const float *src, int length, float *dst) {
10341+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10342+        SIMD_F32 src_value = SIMD_LD_F32(src + index);
10343+        SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
10344+        SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
10345+    }
10346+    return index;
10347+}
10348+
10349+static inline int HardTanhNoLimitMinSSE(int index, const float *src, int length, float *dst, float min_val,
10350+                                            float max_val) {
10351+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10352+        SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
10353+    }
10354+    return index;
10355+}
10356+
10357+static inline int HardTanhNoLimitMaxSSE(int index, const float *src, int length, float *dst, float min_val,
10358+                                            float max_val) {
10359+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10360+        SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
10361+    }
10362+    return index;
10363+}
10364+
10365+static inline int HardTanhLimitMinMaxSSE(int index, const float *src, int length, float *dst, float min_val,
10366+                                             float max_val) {
10367+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10368+        SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
10369+    }
10370+    return index;
10371+}
10372+
10373+static inline int GeluApproximateSSE(int index, const float *src, int length, float *dst) {
10374+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10375+        SIMD_F32 in = SIMD_LD_F32(src + index);
10376+        SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
10377+        SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
10378+        SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
10379+    }
10380+    return index;
10381+}
10382+
10383+static inline int GeluSSE(int index, const float *src, int length, float *dst) {
10384+    SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
10385+    SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
10386+    SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
10387+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10388+      SIMD_F32 in = SIMD_LD_F32(src + index);
10389+      SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
10390+      SIMD_ST_F32(dst + index, res);
10391+    }
10392+    return index;
10393+}
10394+
10395+static inline int EluSSE(int index, const float *src, int length, float *dst, float alpha) {
10396+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10397+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10398+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
10399+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
10400+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
10401+    }
10402+    return index;
10403+}
10404+
10405+static inline int CeluSSE(int index, const float *src, int length, float *dst, float alpha) {
10406+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10407+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10408+        SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
10409+        SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
10410+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
10411+    }
10412+    return index;
10413+}
10414+
10415+static inline int HShrinkSSE(int index, const float *src, int length, float *dst, float lambd) {
10416+    const float neg_lambd = -1 * lambd;
10417+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10418+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10419+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
10420+        SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
10421+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
10422+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
10423+    }
10424+    return index;
10425+}
10426+
10427+static inline int SoftShrinkSSE(int index, const float *src, int length, float *dst, float lambd) {
10428+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
10429+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
10430+
10431+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10432+        SIMD_F32 src_t = SIMD_LD_F32(src + index);
10433+        /* v0 = (in > lamdb) & (in - lamdb) */
10434+        SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
10435+        /* v1 = (in < -lamdb) & (in + lamdb) */
10436+        SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
10437+        /* out = (v0 | v1) */
10438+        SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
10439+    }
10440+    return index;
10441+}
10442+
10443+static inline int SoftsignFp32OptSSE(int index, const float *src, int length, float *dst) {
10444+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10445+        SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
10446+        SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
10447+        SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
10448+    }
10449+    return index;
10450+}
10451+
10452+#undef MS_SIMD_INSTRUCTION
10453+#undef BLOCK_NUM
10454+#pragma GCC pop_options
10455+#undef MS_SIMD_SSE
10456+#ifdef __cplusplus
10457+}
10458+#endif
10459+#endif
10460diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
10461new file mode 100644
10462index 00000000..85996f69
10463--- /dev/null
10464+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
10465@@ -0,0 +1,57 @@
10466+/**
10467+ * Copyright 2022 Huawei Technologies Co., Ltd
10468+ *
10469+ * Licensed under the Apache License, Version 2.0 (the "License");
10470+ * you may not use this file except in compliance with the License.
10471+ * You may obtain a copy of the License at
10472+ *
10473+ * http://www.apache.org/licenses/LICENSE-2.0
10474+ *
10475+ * Unless required by applicable law or agreed to in writing, software
10476+ * distributed under the License is distributed on an "AS IS" BASIS,
10477+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10478+ * See the License for the specific language governing permissions and
10479+ * limitations under the License.
10480+ */
10481+
10482+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_
10483+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_
10484+
10485+#include "nnacl/intrinsics/ms_simd_instructions.h"
10486+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10487+
10488+#ifdef __cplusplus
10489+extern "C" {
10490+#endif
10491+#pragma GCC push_options
10492+#pragma GCC target("sse4.1")
10493+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10494+#define BLOCK_NUM 4
10495+#define MS_SIMD_SSE
10496+
10497+static inline int ShrinkGradSSE(int index, const float *src0, const float *src1,
10498+                                               int length, float *dst, float lambd) {
10499+    SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
10500+    SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
10501+
10502+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10503+        SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
10504+        SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
10505+
10506+        SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
10507+        SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
10508+        SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
10509+
10510+        SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
10511+    }
10512+    return index;
10513+}
10514+
10515+#undef MS_SIMD_INSTRUCTION
10516+#undef BLOCK_NUM
10517+#pragma GCC pop_options
10518+#undef MS_SIMD_SSE
10519+#ifdef __cplusplus
10520+}
10521+#endif
10522+#endif
10523diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
10524new file mode 100644
10525index 00000000..1f5291a4
10526--- /dev/null
10527+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
10528@@ -0,0 +1,210 @@
10529+/**
10530+ * Copyright 2022 Huawei Technologies Co., Ltd
10531+ *
10532+ * Licensed under the Apache License, Version 2.0 (the "License");
10533+ * you may not use this file except in compliance with the License.
10534+ * You may obtain a copy of the License at
10535+ *
10536+ * http://www.apache.org/licenses/LICENSE-2.0
10537+ *
10538+ * Unless required by applicable law or agreed to in writing, software
10539+ * distributed under the License is distributed on an "AS IS" BASIS,
10540+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10541+ * See the License for the specific language governing permissions and
10542+ * limitations under the License.
10543+ */
10544+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_
10545+#define MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_
10546+
10547+#include "nnacl/intrinsics/ms_simd_instructions.h"
10548+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10549+
10550+#ifdef __cplusplus
10551+extern "C" {
10552+#endif
10553+#pragma GCC push_options
10554+#pragma GCC target("sse4.1")
10555+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10556+#define BLOCK_NUM 4
10557+#define MS_SIMD_SSE
10558+#ifdef MS_SIMD_AVX512
10559+  static inline size_t AdamWeightDecayFp32SSE(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10560+    const float *gradient, size_t end) {
10561+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10562+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10563+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10564+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10565+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10566+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10567+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10568+
10569+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10570+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
10571+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10572+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10573+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
10574+
10575+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10576+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10577+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10578+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
10579+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
10580+    avx_r0 = SIMD_SQRT_F32(v_r);
10581+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
10582+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
10583+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
10584+    SIMD_ST_F32(m + index, m_r);
10585+    SIMD_ST_F32(v + index, v_r);
10586+    SIMD_ST_F32(var + index, var_r);
10587+  }
10588+
10589+  return index;
10590+}
10591+
10592+static inline size_t FusedCastAdamFp32Fp16SSE(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10593+    float global_norm_reciprocal, size_t end) {
10594+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10595+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10596+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10597+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10598+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10599+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10600+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10601+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10602+
10603+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10604+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
10605+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10606+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10607+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
10608+
10609+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
10610+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10611+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10612+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10613+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
10614+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
10615+    avx_r0 = SIMD_SQRT_F32(v_r);
10616+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
10617+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
10618+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
10619+    SIMD_ST_F32(var + index, var_r);
10620+    SIMD_ST_F32(m + index, m_r);
10621+    SIMD_ST_F32(v + index, v_r);
10622+  }
10623+
10624+  return index;
10625+}
10626+
10627+static inline size_t FusedCastAdamFp32Fp32SSE(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10628+    float global_norm_reciprocal, size_t end) {  // AdamWeightDecay step, fp32 var / fp32 gradient; returns first index not processed by SIMD
10629+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10630+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10631+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10632+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10633+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10634+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10635+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10636+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10637+
10638+  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // fix: (end - BLOCK_NUM + 1) underflows size_t when end < BLOCK_NUM
10639+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
10640+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10641+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10642+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
10643+
10644+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);  // gradient clipping by global norm
10645+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10646+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10647+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10648+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);  // m = beta1*m + (1-beta1)*g
10649+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);  // v = beta2*v + (1-beta2)*g^2
10650+    avx_r0 = SIMD_SQRT_F32(v_r);
10651+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));  // m / (sqrt(v) + eps)
10652+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);  // + decay * var (weight decay)
10653+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);  // var -= lr * update
10654+    SIMD_ST_F32(var + index, var_r);
10655+    SIMD_ST_F32(m + index, m_r);
10656+    SIMD_ST_F32(v + index, v_r);
10657+  }
10658+
10659+  return index;
10660+}
10661+
10662+static inline size_t FusedCastAdamFp16Fp16SSE(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10663+    float global_norm_reciprocal, size_t end) {  // AdamWeightDecay step, fp16 var / fp16 gradient; returns first index not processed by SIMD
10664+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10665+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10666+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10667+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10668+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10669+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10670+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10671+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10672+
10673+  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // fix: (end - BLOCK_NUM + 1) underflows size_t when end < BLOCK_NUM
10674+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));  // fix: was var16 (no + index) — reloaded lanes 0..3 every iteration; store below uses var16 + index
10675+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10676+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10677+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
10678+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);  // gradient clipping by global norm
10679+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10680+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10681+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10682+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);  // m = beta1*m + (1-beta1)*g
10683+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);  // v = beta2*v + (1-beta2)*g^2
10684+    avx_r0 = SIMD_SQRT_F32(v_r);
10685+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));  // m / (sqrt(v) + eps)
10686+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);  // + decay * var (weight decay)
10687+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);  // var -= lr * update
10688+    SIMD_ST_F32(m + index, m_r);
10689+    SIMD_ST_F32(v + index, v_r);
10690+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
10691+  }
10692+
10693+  return index;
10694+}
10695+
10696+static inline size_t FusedCastAdamFp16Fp32SSE(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
10697+    float global_norm_reciprocal, size_t end) {  // AdamWeightDecay step, fp16 var / fp32 gradient; returns first index not processed by SIMD
10698+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
10699+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
10700+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
10701+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
10702+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
10703+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
10704+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
10705+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
10706+
10707+  for (; index + BLOCK_NUM <= end; index += BLOCK_NUM) {  // fix: (end - BLOCK_NUM + 1) underflows size_t when end < BLOCK_NUM
10708+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));  // fix: was var16 (no + index) — reloaded lanes 0..3 every iteration; store below uses var16 + index
10709+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
10710+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
10711+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
10712+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);  // gradient clipping by global norm
10713+    m_r = SIMD_MUL_F32(m_r, beta1_r);
10714+    v_r = SIMD_MUL_F32(v_r, beta2_r);
10715+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
10716+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);  // m = beta1*m + (1-beta1)*g
10717+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);  // v = beta2*v + (1-beta2)*g^2
10718+    avx_r0 = SIMD_SQRT_F32(v_r);
10719+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));  // m / (sqrt(v) + eps)
10720+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);  // + decay * var (weight decay)
10721+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);  // var -= lr * update
10722+    SIMD_ST_F32(m + index, m_r);
10723+    SIMD_ST_F32(v + index, v_r);
10724+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
10725+  }
10726+
10727+  return index;
10728+}
10729+#endif
10730+
10731+#undef MS_SIMD_INSTRUCTION
10732+#undef BLOCK_NUM
10733+#pragma GCC pop_options
10734+#undef MS_SIMD_SSE
10735+#ifdef __cplusplus
10736+}
10737+#endif
10738+#endif
10739diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
10740new file mode 100644
10741index 00000000..eb705534
10742--- /dev/null
10743+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
10744@@ -0,0 +1,124 @@
10745+/**
10746+ * Copyright 2022 Huawei Technologies Co., Ltd
10747+ *
10748+ * Licensed under the Apache License, Version 2.0 (the "License");
10749+ * you may not use this file except in compliance with the License.
10750+ * You may obtain a copy of the License at
10751+ *
10752+ * http://www.apache.org/licenses/LICENSE-2.0
10753+ *
10754+ * Unless required by applicable law or agreed to in writing, software
10755+ * distributed under the License is distributed on an "AS IS" BASIS,
10756+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10757+ * See the License for the specific language governing permissions and
10758+ * limitations under the License.
10759+ */
10760+
10761+#ifndef MINDSPORE_NNACL_FP32_ADD_SSE_H_
10762+#define MINDSPORE_NNACL_FP32_ADD_SSE_H_
10763+
10764+#include "nnacl/intrinsics/ms_simd_instructions.h"
10765+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10766+
10767+#ifdef __cplusplus
10768+extern "C" {
10769+#endif
10770+#pragma GCC push_options
10771+#pragma GCC target("sse4.1")
10772+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10773+#define BLOCK_NUM 4
10774+#define MS_SIMD_SSE
10775+
10776+static inline int ElementOptAddSSE(int index, const float *in0, const float *in1, float *out, int size) {
10777+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
10778+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10779+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10780+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
10781+    SIMD_ST_F32(out + index, vout);
10782+  }
10783+  return index;
10784+}
10785+
10786+static inline int ElementOptAddIntSSE(int index, const int *in0, const int *in1, int *out,
10787+                                                     int size) {
10788+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
10789+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10790+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
10791+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
10792+    SIMD_ST_EPI32(out + index, vout);
10793+  }
10794+  return index;
10795+}
10796+
10797+static inline int ElementOptAddReluSSE(int index, const float *in0, const float *in1, float *out,
10798+                                                      int size) {
10799+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
10800+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10801+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10802+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
10803+    SIMD_ST_F32(out + index, vout);
10804+  }
10805+  return index;
10806+}
10807+
10808+static inline int ElementOptAddRelu6SSE(int index, const float *in0, const float *in1, float *out,
10809+                                                       int size) {
10810+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
10811+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10812+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10813+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
10814+    SIMD_ST_F32(out + index, vout);
10815+  }
10816+  return index;
10817+}
10818+
10819+static inline int ElementAddSSE(int index, const float *in0, const float *in1, float *out, int size) {
10820+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10821+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10822+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10823+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
10824+    SIMD_ST_F32(out + index, vout);
10825+  }
10826+  return index;
10827+}
10828+
10829+static inline int ElementAddReluSSE(int index, const float *in0, const float *in1, float *out,
10830+                                                   int size) {
10831+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10832+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10833+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10834+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
10835+    SIMD_ST_F32(out + index, vout);
10836+  }
10837+  return index;
10838+}
10839+
10840+static inline int ElementAddRelu6SSE(int index, const float *in0, const float *in1, float *out,
10841+                                                    int size) {
10842+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10843+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
10844+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
10845+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
10846+    SIMD_ST_F32(out + index, vout);
10847+  }
10848+  return index;
10849+}
10850+
10851+static inline int ElementAddIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
10852+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10853+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
10854+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
10855+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
10856+    SIMD_ST_EPI32(out + index, vout);
10857+  }
10858+  return index;
10859+}
10860+
10861+#undef MS_SIMD_INSTRUCTION
10862+#undef BLOCK_NUM
10863+#pragma GCC pop_options
10864+#undef MS_SIMD_SSE
10865+#ifdef __cplusplus
10866+}
10867+#endif
10868+#endif
10869diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
10870new file mode 100644
10871index 00000000..173890b4
10872--- /dev/null
10873+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
10874@@ -0,0 +1,254 @@
10875+/**
10876+ * Copyright 2022 Huawei Technologies Co., Ltd
10877+ *
10878+ * Licensed under the Apache License, Version 2.0 (the "License");
10879+ * you may not use this file except in compliance with the License.
10880+ * You may obtain a copy of the License at
10881+ *
10882+ * http://www.apache.org/licenses/LICENSE-2.0
10883+ *
10884+ * Unless required by applicable law or agreed to in writing, software
10885+ * distributed under the License is distributed on an "AS IS" BASIS,
10886+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10887+ * See the License for the specific language governing permissions and
10888+ * limitations under the License.
10889+ */
10890+
10891+#ifndef MINDSPORE_NNACL_ARITHMETIC_SSE_H_
10892+#define MINDSPORE_NNACL_ARITHMETIC_SSE_H_
10893+
10894+#include "nnacl/intrinsics/ms_simd_instructions.h"
10895+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
10896+
10897+#ifdef __cplusplus
10898+extern "C" {
10899+#endif
10900+#pragma GCC push_options
10901+#pragma GCC target("sse4.1")
10902+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
10903+#define BLOCK_NUM 4
10904+#define MS_SIMD_SSE
10905+
10906+#ifndef MS_SIMD_NEON
10907+static inline int ElementFloorModSSE(int index, const float *in0, const float *in1, float *out, int size) {
10908+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10909+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10910+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10911+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10912+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
10913+    SIMD_ST_F32(out + index, out_tmp);
10914+  }
10915+  return index;
10916+}
10917+
10918+static inline int ElementOptFloorModNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
10919+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
10920+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10921+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10922+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10923+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
10924+    SIMD_ST_F32(out + index, out_tmp);
10925+  }
10926+  return index;
10927+}
10928+
10929+static inline int ElementOptFloorModNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
10930+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
10931+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10932+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10933+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10934+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
10935+    SIMD_ST_F32(out + index, out_tmp);
10936+  }
10937+  return index;
10938+}
10939+
10940+static inline int ElementFloorDivSSE(int index, const float *in0, const float *in1, float *out, int size) {
10941+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10942+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10943+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10944+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10945+    SIMD_ST_F32(out + index, floor_tmp);
10946+  }
10947+  return index;
10948+}
10949+
10950+static inline int ElementOptFloorDivNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
10951+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
10952+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10953+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
10954+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10955+    SIMD_ST_F32(out + index, out_tmp);
10956+  }
10957+  return index;
10958+}
10959+
10960+static inline int ElementOptFloorDivNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
10961+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
10962+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10963+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
10964+    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
10965+    SIMD_ST_F32(out + index, out_tmp);
10966+  }
10967+  return index;
10968+}
10969+#endif
10970+
10971+static inline int ElementFloorDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
10972+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10973+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
10974+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
10975+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
10976+    SIMD_ST_EPI32(out + index, out_tmp);
10977+  }
10978+  return index;
10979+}
10980+
10981+static inline int ElementOptFloorDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
10982+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
10983+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10984+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
10985+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
10986+    SIMD_ST_EPI32(out + index, out_tmp);
10987+  }
10988+  return index;
10989+}
10990+
10991+static inline int ElementOptFloorDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
10992+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
10993+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
10994+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
10995+    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
10996+    SIMD_ST_EPI32(out + index, out_tmp);
10997+  }
10998+  return index;
10999+}
11000+
11001+static inline int ElementMaximumSSE(int index, const float *in0, const float *in1, float *out, int size) {
11002+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11003+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11004+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11005+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
11006+    SIMD_ST_F32(out + index, out_tmp);
11007+  }
11008+  return index;
11009+}
11010+
11011+static inline int ElementOptMaximumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
11012+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
11013+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11014+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11015+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
11016+    SIMD_ST_F32(out + index, out_tmp);
11017+  }
11018+  return index;
11019+}
11020+
11021+static inline int ElementOptMaximumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
11022+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
11023+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11024+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11025+    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
11026+    SIMD_ST_F32(out + index, out_tmp);
11027+  }
11028+  return index;
11029+}
11030+
11031+static inline int ElementMaximumIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
11032+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11033+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11034+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11035+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
11036+    SIMD_ST_EPI32(out + index, out_tmp);
11037+  }
11038+  return index;
11039+}
11040+
11041+static inline int ElementOptMaximumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
11042+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
11043+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11044+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11045+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
11046+    SIMD_ST_EPI32(out + index, out_tmp);
11047+  }
11048+  return index;
11049+}
11050+
11051+static inline int ElementOptMaximumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
11052+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
11053+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11054+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11055+    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
11056+    SIMD_ST_EPI32(out + index, out_tmp);
11057+  }
11058+  return index;
11059+}
11060+
11061+static inline int ElementMinimumIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
11062+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11063+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11064+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11065+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
11066+    SIMD_ST_EPI32(out + index, out_tmp);
11067+  }
11068+  return index;
11069+}
11070+
11071+static inline int ElementOptMinimumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
11072+  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
11073+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11074+    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
11075+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
11076+    SIMD_ST_EPI32(out + index, out_tmp);
11077+  }
11078+  return index;
11079+}
11080+
11081+static inline int ElementOptMinimumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
11082+  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
11083+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11084+    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
11085+    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
11086+    SIMD_ST_EPI32(out + index, out_tmp);
11087+  }
11088+  return index;
11089+}
11090+
11091+static inline int ElementMinimumSSE(int index, const float *in0, const float *in1, float *out, int size) {
11092+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11093+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11094+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11095+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
11096+    SIMD_ST_F32(out + index, out_tmp);
11097+  }
11098+  return index;
11099+}
11100+
11101+static inline int ElementOptMinimumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
11102+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
11103+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11104+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
11105+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
11106+    SIMD_ST_F32(out + index, out_tmp);
11107+  }
11108+  return index;
11109+}
11110+
11111+static inline int ElementOptMinimumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
11112+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
11113+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11114+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
11115+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
11116+    SIMD_ST_F32(out + index, out_tmp);
11117+  }
11118+  return index;
11119+}
11120+
11121+#undef MS_SIMD_INSTRUCTION
11122+#undef BLOCK_NUM
11123+#pragma GCC pop_options
11124+#undef MS_SIMD_SSE
11125+#ifdef __cplusplus
11126+}
11127+#endif
11128+#endif
11129diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
11130new file mode 100644
11131index 00000000..0a1d21c2
11132--- /dev/null
11133+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
11134@@ -0,0 +1,129 @@
11135+/**
11136+ * Copyright 2022 Huawei Technologies Co., Ltd
11137+ *
11138+ * Licensed under the Apache License, Version 2.0 (the "License");
11139+ * you may not use this file except in compliance with the License.
11140+ * You may obtain a copy of the License at
11141+ *
11142+ * http://www.apache.org/licenses/LICENSE-2.0
11143+ *
11144+ * Unless required by applicable law or agreed to in writing, software
11145+ * distributed under the License is distributed on an "AS IS" BASIS,
11146+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11147+ * See the License for the specific language governing permissions and
11148+ * limitations under the License.
11149+ */
11150+
11151+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_
11152+#define MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_
11153+
11154+#include "nnacl/intrinsics/ms_simd_instructions.h"
11155+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11156+
11157+#ifdef __cplusplus
11158+extern "C" {
11159+#endif
11160+#pragma GCC push_options
11161+#pragma GCC target("sse4.1")
11162+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11163+#define BLOCK_NUM 4
11164+#define MS_SIMD_SSE
11165+
11166+#if defined(MS_SIMD_AVX512)
11167+// only avx512 support abs fp32 instruction
11168+static inline int ElementAbsSSE(int index, const float *input, float *output, const int element_size) {
11169+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11170+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
11171+  }
11172+  return index;
11173+}
11174+
11175+static inline int ElementAbsIntSSE(int index, const int *input, int *output, const int element_size) {
11176+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11177+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
11178+  }
11179+  return index;
11180+}
11181+#endif
11182+
11183+static inline int ElementSquareSSE(int index, const float *input, float *output, const int element_size) {
11184+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11185+    SIMD_F32 vin = SIMD_LD_F32(input + index);
11186+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
11187+  }
11188+  return index;
11189+}
11190+
11191+static inline int ElementSqrtSSE(int index, const float *input, float *output, const int element_size) {
11192+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11193+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
11194+  }
11195+  return index;
11196+}
11197+
11198+static inline int ElementRsqrtSSE(int index, const float *input, float *output, const int element_size) {
11199+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11200+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
11201+  }
11202+  return index;
11203+}
11204+
11205+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
11206+// avx512 dont support round fp32 instruction
11207+static inline int ElementRoundSSE(int index, const float *input, float *output, const int element_size) {
11208+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11209+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
11210+  }
11211+  return index;
11212+}
11213+#endif
11214+
11215+#ifndef MS_SIMD_NEON
11216+// neon dont support floor fp32 instruction
11217+static inline int ElementFloorSSE(int index, const float *input, float *output, const int element_size) {
11218+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11219+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
11220+  }
11221+  return index;
11222+}
11223+#endif
11224+
11225+#ifndef MS_SIMD_NEON
11226+static inline int ElementCeilSSE(int index, const float *input, float *output, const int element_size) {
11227+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11228+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
11229+  }
11230+  return index;
11231+}
11232+#endif
11233+
11234+static inline int ElementNegativeSSE(int index, const float *input, float *output, const int element_size) {
11235+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11236+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
11237+  }
11238+  return index;
11239+}
11240+
11241+static inline int ElementNegativeIntSSE(int index, const int *input, int *output, const int element_size) {
11242+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11243+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
11244+  }
11245+  return index;
11246+}
11247+
11248+static inline int ElementReciprocalSSE(int index, const float *input, float *output, const int element_size) {
11249+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
11250+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11251+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
11252+  }
11253+  return index;
11254+}
11255+
11256+#undef MS_SIMD_INSTRUCTION
11257+#undef BLOCK_NUM
11258+#pragma GCC pop_options
11259+#undef MS_SIMD_SSE
11260+#ifdef __cplusplus
11261+}
11262+#endif
11263+#endif
11264diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
11265new file mode 100644
11266index 00000000..f04b4e1f
11267--- /dev/null
11268+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
11269@@ -0,0 +1,67 @@
11270+/**
11271+ * Copyright 2022 Huawei Technologies Co., Ltd
11272+ *
11273+ * Licensed under the Apache License, Version 2.0 (the "License");
11274+ * you may not use this file except in compliance with the License.
11275+ * You may obtain a copy of the License at
11276+ *
11277+ * http://www.apache.org/licenses/LICENSE-2.0
11278+ *
11279+ * Unless required by applicable law or agreed to in writing, software
11280+ * distributed under the License is distributed on an "AS IS" BASIS,
11281+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11282+ * See the License for the specific language governing permissions and
11283+ * limitations under the License.
11284+ */
11285+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_  // fix: guard was copy-pasted from the activation header and collided with it
11286+#define MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_
11287+
11288+#include "nnacl/intrinsics/ms_simd_instructions.h"
11289+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11290+
11291+#ifdef __cplusplus
11292+extern "C" {
11293+#endif
11294+#pragma GCC push_options
11295+#pragma GCC target("sse4.1")
11296+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11297+#define BLOCK_NUM 4
11298+#define MS_SIMD_SSE
11299+
11300+static inline int BatchNormFp32SSE(int index, const float *input, const float *mean,
11301+  const float *variance, int channel, float epsilon, float *output) {
11302+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11303+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
11304+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
11305+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
11306+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
11307+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
11308+    SIMD_ST_F32(output + index, output_data);
11309+  }
11310+  return index;
11311+}
11312+
11313+static inline int FusedBatchNormFp32SSE(int index, const float *input, const float *scale,
11314+  const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
11315+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11316+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
11317+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
11318+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
11319+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
11320+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
11321+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
11322+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
11323+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
11324+    SIMD_ST_F32(output + index, output_data);
11325+  }
11326+  return index;
11327+}
11328+
11329+#undef MS_SIMD_INSTRUCTION
11330+#undef BLOCK_NUM
11331+#pragma GCC pop_options
11332+#undef MS_SIMD_SSE
11333+#ifdef __cplusplus
11334+}
11335+#endif
11336+#endif
11337diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
11338new file mode 100644
11339index 00000000..c929ccaf
11340--- /dev/null
11341+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
11342@@ -0,0 +1,69 @@
11343+/**
11344+ * Copyright 2022 Huawei Technologies Co., Ltd
11345+ *
11346+ * Licensed under the Apache License, Version 2.0 (the "License");
11347+ * you may not use this file except in compliance with the License.
11348+ * You may obtain a copy of the License at
11349+ *
11350+ * http://www.apache.org/licenses/LICENSE-2.0
11351+ *
11352+ * Unless required by applicable law or agreed to in writing, software
11353+ * distributed under the License is distributed on an "AS IS" BASIS,
11354+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11355+ * See the License for the specific language governing permissions and
11356+ * limitations under the License.
11357+ */
11358+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_
11359+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_
11360+
11361+#include "nnacl/intrinsics/ms_simd_instructions.h"
11362+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11363+
11364+#ifdef __cplusplus
11365+extern "C" {
11366+#endif
11367+#pragma GCC push_options
11368+#pragma GCC target("sse4.1")
11369+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11370+#define BLOCK_NUM 4
11371+#define MS_SIMD_SSE
11372+
11373+static inline int BCEWithLogitLossSSE(int index, const float *logits, const float *label,
11374+    const float *weight, const float *pos_weight, int length, bool reduction, float *output,
11375+    float *reduction_sum) {
11376+    SIMD_F32 zero = SIMD_SET0_F32;
11377+    SIMD_F32 ones = SIMD_MOV_F32(1.0f);
11378+    SIMD_F32 middle_output = SIMD_SET0_F32;
11379+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11380+      SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
11381+      SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
11382+      SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
11383+      SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
11384+      SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
11385+      SIMD_F32 max_value = neg_logits_tmp;
11386+      max_value = SIMD_MIN_F32(max_value, zero);
11387+      SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
11388+      SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
11389+      SIMD_F32 log_exp_value =
11390+        SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
11391+      SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
11392+                                    SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
11393+      if (reduction) {
11394+        middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
11395+      } else {
11396+        SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
11397+      }
11398+    }
11399+    if (reduction) {
11400+      *reduction_sum += SIMD_GET_SUM_F32(middle_output);
11401+    }
11402+    return index;
11403+}
11404+#undef MS_SIMD_INSTRUCTION
11405+#undef BLOCK_NUM
11406+#pragma GCC pop_options
11407+#undef MS_SIMD_SSE
11408+#ifdef __cplusplus
11409+}
11410+#endif
11411+#endif
11412diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
11413new file mode 100644
11414index 00000000..0544d239
11415--- /dev/null
11416+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
11417@@ -0,0 +1,64 @@
11418+/**
11419+ * Copyright 2022 Huawei Technologies Co., Ltd
11420+ *
11421+ * Licensed under the Apache License, Version 2.0 (the "License");
11422+ * you may not use this file except in compliance with the License.
11423+ * You may obtain a copy of the License at
11424+ *
11425+ * http://www.apache.org/licenses/LICENSE-2.0
11426+ *
11427+ * Unless required by applicable law or agreed to in writing, software
11428+ * distributed under the License is distributed on an "AS IS" BASIS,
11429+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11430+ * See the License for the specific language governing permissions and
11431+ * limitations under the License.
11432+ */
11433+
11434+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
11435+#define MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
11436+
11437+#include "nnacl/intrinsics/ms_simd_instructions.h"
11438+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11439+
11440+#ifdef __cplusplus
11441+extern "C" {
11442+#endif
11443+#pragma GCC push_options
11444+#pragma GCC target("sse4.1")
11445+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11446+#define BLOCK_NUM 4
11447+#define MS_SIMD_SSE
11448+
11449+static inline int BiasAddByInnerCoreSSE(int index, const float *input, const float *bias, float *output,
11450+                                                       int64_t num) {
11451+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11452+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
11453+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
11454+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
11455+    SIMD_ST_F32(output + index, vout);
11456+  }
11457+  return index;
11458+}
11459+
11460+static inline int BiasAddByBatchCoreSSE(int index, const float *input, const float *bias, float *output1,
11461+                                                       float *output2, float *output3, float *output4, int64_t num) {
11462+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11463+    SIMD_LDX4_F32(input_data, input + index, num);
11464+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
11465+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
11466+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
11467+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
11468+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
11469+  }
11470+  return index;
11471+}
11472+
11473+#undef MS_SIMD_INSTRUCTION
11474+#undef BLOCK_NUM
11475+#pragma GCC pop_options
11476+#undef MS_SIMD_SSE
11477+#ifdef __cplusplus
11478+}
11479+#endif
11480+
11481+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
11482diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
11483new file mode 100644
11484index 00000000..4eca209f
11485--- /dev/null
11486+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
11487@@ -0,0 +1,56 @@
11488+/**
11489+ * Copyright 2022 Huawei Technologies Co., Ltd
11490+ *
11491+ * Licensed under the Apache License, Version 2.0 (the "License");
11492+ * you may not use this file except in compliance with the License.
11493+ * You may obtain a copy of the License at
11494+ *
11495+ * http://www.apache.org/licenses/LICENSE-2.0
11496+ *
11497+ * Unless required by applicable law or agreed to in writing, software
11498+ * distributed under the License is distributed on an "AS IS" BASIS,
11499+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11500+ * See the License for the specific language governing permissions and
11501+ * limitations under the License.
11502+ */
11503+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_
11504+#define MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_
11505+
11506+#include "nnacl/intrinsics/ms_simd_instructions.h"
11507+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11508+
11509+#ifdef __cplusplus
11510+extern "C" {
11511+#endif
11512+#pragma GCC push_options
11513+#pragma GCC target("sse4.1")
11514+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11515+#define BLOCK_NUM 4
11516+#define MS_SIMD_SSE
11517+
11518+static inline int Int32ToFloat32SSE(int index, const int32_t *input, float *output, int number) {
11519+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11520+    SIMD_EPI32 value = SIMD_LD_EPI32(input + index);
11521+    SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value));
11522+  }
11523+  return index;
11524+}
11525+
11526+#ifndef MS_SIMD_NEON
11527+static inline int Float32ToInt32SSE(int index, const float *input, int32_t *output, int number) {
11528+  for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11529+    SIMD_F32 value = SIMD_LD_F32(input + index);
11530+    SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value));
11531+  }
11532+  return index;
11533+}
11534+#endif
11535+
11536+#undef MS_SIMD_INSTRUCTION
11537+#undef BLOCK_NUM
11538+#pragma GCC pop_options
11539+#undef MS_SIMD_SSE
11540+#ifdef __cplusplus
11541+}
11542+#endif
11543+#endif
11544diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
11545new file mode 100644
11546index 00000000..3d116113
11547--- /dev/null
11548+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
11549@@ -0,0 +1,70 @@
11550+/**
11551+ * Copyright 2022 Huawei Technologies Co., Ltd
11552+ *
11553+ * Licensed under the Apache License, Version 2.0 (the "License");
11554+ * you may not use this file except in compliance with the License.
11555+ * You may obtain a copy of the License at
11556+ *
11557+ * http://www.apache.org/licenses/LICENSE-2.0
11558+ *
11559+ * Unless required by applicable law or agreed to in writing, software
11560+ * distributed under the License is distributed on an "AS IS" BASIS,
11561+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11562+ * See the License for the specific language governing permissions and
11563+ * limitations under the License.
11564+ */
11565+#ifndef MINDSPORE_NNACL_FP32_CDIST_SSE_H_
11566+#define MINDSPORE_NNACL_FP32_CDIST_SSE_H_
11567+
11568+#include "nnacl/intrinsics/ms_simd_instructions.h"
11569+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11570+
11571+#ifdef __cplusplus
11572+extern "C" {
11573+#endif
11574+#pragma GCC push_options
11575+#pragma GCC target("sse4.1")
11576+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11577+#define BLOCK_NUM 4
11578+#define MS_SIMD_SSE
11579+
11580+static inline int64_t CdistTwoNormalOptSSE(int64_t index, const float *a, const float *b,
11581+                                                          float *out, int64_t size) {
11582+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
11583+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11584+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
11585+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
11586+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
11587+    tmp_vec = SIMD_ABS_F32(tmp_vec);
11588+    result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec);
11589+  }
11590+  *out += SIMD_GET_SUM_F32(result_vec);
11591+
11592+  return index;
11593+}
11594+
11595+static inline int64_t CdistPNormalOptSSE(int64_t index, const float *a, const float *b,
11596+                                                        float *out, int64_t size, float p) {
11597+  SIMD_F32 result_vec = SIMD_MOV_F32(0.0f);
11598+  SIMD_F32 p_vec = SIMD_MOV_F32(p);
11599+  for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11600+    SIMD_F32 a_vec = SIMD_LD_F32(a + index);
11601+    SIMD_F32 b_vec = SIMD_LD_F32(b + index);
11602+    SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec);
11603+    tmp_vec = SIMD_ABS_F32(tmp_vec);
11604+    tmp_vec = SIMD_POW_F32(tmp_vec, p_vec);
11605+    result_vec = SIMD_ADD_F32(tmp_vec, result_vec);
11606+  }
11607+  *out += SIMD_GET_SUM_F32(result_vec);
11608+
11609+  return index;
11610+}
11611+
11612+#undef MS_SIMD_INSTRUCTION
11613+#undef BLOCK_NUM
11614+#pragma GCC pop_options
11615+#undef MS_SIMD_SSE
11616+#ifdef __cplusplus
11617+}
11618+#endif
11619+#endif
11620diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
11621new file mode 100644
11622index 00000000..1b67143f
11623--- /dev/null
11624+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
11625@@ -0,0 +1,121 @@
11626+/**
11627+ * Copyright 2022 Huawei Technologies Co., Ltd
11628+ *
11629+ * Licensed under the Apache License, Version 2.0 (the "License");
11630+ * you may not use this file except in compliance with the License.
11631+ * You may obtain a copy of the License at
11632+ *
11633+ * http://www.apache.org/licenses/LICENSE-2.0
11634+ *
11635+ * Unless required by applicable law or agreed to in writing, software
11636+ * distributed under the License is distributed on an "AS IS" BASIS,
11637+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11638+ * See the License for the specific language governing permissions and
11639+ * limitations under the License.
11640+ */
11641+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_
11642+#define MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_
11643+
11644+#include "nnacl/intrinsics/ms_simd_instructions.h"
11645+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11646+
11647+#ifdef __cplusplus
11648+extern "C" {
11649+#endif
11650+#pragma GCC push_options
11651+#pragma GCC target("sse4.1")
11652+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11653+#define BLOCK_NUM 4
11654+#define MS_SIMD_SSE
11655+
11656+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
11657+// (a, b, c) -> (0, a,   a+b)    exclusive == true
11658+static inline int64_t CumsumOutputInitWithInputSSE(int64_t index, const float *layer_input,
11659+  float *layer_output, int inner_dim) {
11660+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11661+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
11662+  }
11663+  return index;
11664+}
11665+
11666+static inline int64_t CumsumOutputInitWithZeroSSE(int64_t index, float *layer_output, int inner_dim) {
11667+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11668+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
11669+  }
11670+  return index;
11671+}
11672+
11673+static inline int64_t CumsumSSE(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
11674+  int inner_dim) {
11675+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11676+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
11677+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
11678+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
11679+    SIMD_ST_F32(layer_output + index, out_val);
11680+  }
11681+  return index;
11682+}
11683+
11684+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
11685+// (a, b, c) -> (c+b, c, 0) exclusive==true
11686+static inline int64_t CumsumReverseSSE(int64_t index, const float *layer_input, float *layer_output,
11687+  float *layer_last_output, int inner_dim) {
11688+
11689+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11690+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
11691+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
11692+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
11693+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
11694+  }
11695+  return index;
11696+}
11697+
11698+// (a, b, c) -> (a, a+b, a+b+c)  exclusive == false
11699+// (a, b, c) -> (0, a,   a+b)    exclusive == true
11700+static inline int64_t CumsumIntOutputInitWithInputSSE(int64_t index, const int *layer_input,
11701+  int *layer_output, int inner_dim) {
11702+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11703+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
11704+  }
11705+  return index;
11706+}
11707+
11708+static inline int64_t CumsumIntOutputInitWithZeroSSE(int64_t index, int *layer_output, int inner_dim) {
11709+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11710+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
11711+  }
11712+  return index;
11713+}
11714+
11715+static inline int64_t CumsumIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
11716+  int inner_dim) {
11717+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11718+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
11719+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
11720+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
11721+    SIMD_ST_EPI32(layer_output + index, out_val);
11722+  }
11723+  return index;
11724+}
11725+
11726+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
11727+// (a, b, c) -> (c+b, c, 0) exclusive==true
11728+static inline int64_t CumsumReverseIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
11729+  int inner_dim) {
11730+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11731+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
11732+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
11733+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
11734+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
11735+  }
11736+  return index;
11737+}
11738+
11739+#undef MS_SIMD_INSTRUCTION
11740+#undef BLOCK_NUM
11741+#pragma GCC pop_options
11742+#undef MS_SIMD_SSE
11743+#ifdef __cplusplus
11744+}
11745+#endif
11746+#endif
11747diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
11748new file mode 100644
11749index 00000000..5f0c6009
11750--- /dev/null
11751+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
11752@@ -0,0 +1,167 @@
11753+/**
11754+ * Copyright 2022 Huawei Technologies Co., Ltd
11755+ *
11756+ * Licensed under the Apache License, Version 2.0 (the "License");
11757+ * you may not use this file except in compliance with the License.
11758+ * You may obtain a copy of the License at
11759+ *
11760+ * http://www.apache.org/licenses/LICENSE-2.0
11761+ *
11762+ * Unless required by applicable law or agreed to in writing, software
11763+ * distributed under the License is distributed on an "AS IS" BASIS,
11764+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11765+ * See the License for the specific language governing permissions and
11766+ * limitations under the License.
11767+ */
11768+
11769+#ifndef MINDSPORE_NNACL_FP32_DIV_SSE_H_
11770+#define MINDSPORE_NNACL_FP32_DIV_SSE_H_
11771+
11772+#include "nnacl/intrinsics/ms_simd_instructions.h"
11773+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11774+
11775+#ifdef __cplusplus
11776+extern "C" {
11777+#endif
11778+#pragma GCC push_options
11779+#pragma GCC target("sse4.1")
11780+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11781+#define BLOCK_NUM 4
11782+#define MS_SIMD_SSE
11783+
11784+static inline int ElementOptDivNum0SSE(int index, const float *in0, const float *in1, float *out,
11785+                                                      int size) {
11786+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
11787+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11788+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11789+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
11790+    SIMD_ST_F32(out + index, vout);
11791+  }
11792+  return index;
11793+}
11794+
11795+static inline int ElementOptDivNum1SSE(int index, const float *in0, const float *in1, float *out,
11796+                                                      int size) {
11797+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
11798+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11799+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11800+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
11801+    SIMD_ST_F32(out + index, vout);
11802+  }
11803+  return index;
11804+}
11805+
11806+static inline int ElementOptDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
11807+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
11808+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11809+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
11810+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
11811+    SIMD_ST_EPI32(out + index, vout);
11812+  }
11813+  return index;
11814+}
11815+
11816+static inline int ElementOptDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
11817+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
11818+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11819+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
11820+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_);
11821+    SIMD_ST_EPI32(out + index, vout);
11822+  }
11823+  return index;
11824+}
11825+
11826+static inline int ElementOptDivReluNum0SSE(int index, const float *in0, const float *in1, float *out,
11827+                                                          int size) {
11828+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
11829+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11830+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11831+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f);
11832+    SIMD_ST_F32(out + index, vout);
11833+  }
11834+  return index;
11835+}
11836+
11837+static inline int ElementOptDivReluNum1SSE(int index, const float *in0, const float *in1, float *out,
11838+                                                          int size) {
11839+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
11840+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11841+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11842+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f);
11843+    SIMD_ST_F32(out + index, vout);
11844+  }
11845+  return index;
11846+}
11847+
11848+static inline int ElementOptDivRelu6Num0SSE(int index, const float *in0, const float *in1, float *out,
11849+                                                           int size) {
11850+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
11851+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11852+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11853+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f);
11854+    SIMD_ST_F32(out + index, vout);
11855+  }
11856+  return index;
11857+}
11858+
11859+static inline int ElementOptDivRelu6Num1SSE(int index, const float *in0, const float *in1, float *out,
11860+                                                           int size) {
11861+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
11862+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11863+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11864+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f);
11865+    SIMD_ST_F32(out + index, vout);
11866+  }
11867+  return index;
11868+}
11869+
11870+static inline int ElementDivSSE(int index, const float *in0, const float *in1, float *out, int size) {
11871+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11872+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11873+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11874+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1);
11875+    SIMD_ST_F32(out + index, vout);
11876+  }
11877+  return index;
11878+}
11879+
11880+static inline int ElementDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
11881+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11882+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
11883+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
11884+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1);
11885+    SIMD_ST_EPI32(out + index, vout);
11886+  }
11887+  return index;
11888+}
11889+
11890+static inline int ElementDivReluSSE(int index, const float *in0, const float *in1, float *out,
11891+                                                   int size) {
11892+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11893+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11894+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11895+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f);
11896+    SIMD_ST_F32(out + index, vout);
11897+  }
11898+  return index;
11899+}
11900+
11901+static inline int ElementDivRelu6SSE(int index, const float *in0, const float *in1, float *out,
11902+                                                    int size) {
11903+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11904+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
11905+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
11906+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f);
11907+    SIMD_ST_F32(out + index, vout);
11908+  }
11909+  return index;
11910+}
11911+
11912+#undef MS_SIMD_INSTRUCTION
11913+#undef BLOCK_NUM
11914+#pragma GCC pop_options
11915+#undef MS_SIMD_SSE
11916+#ifdef __cplusplus
11917+}
11918+#endif
11919+#endif
11920diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
11921new file mode 100644
11922index 00000000..2429ed38
11923--- /dev/null
11924+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
11925@@ -0,0 +1,46 @@
11926+/**
11927+ * Copyright 2022 Huawei Technologies Co., Ltd
11928+ *
11929+ * Licensed under the Apache License, Version 2.0 (the "License");
11930+ * you may not use this file except in compliance with the License.
11931+ * You may obtain a copy of the License at
11932+ *
11933+ * http://www.apache.org/licenses/LICENSE-2.0
11934+ *
11935+ * Unless required by applicable law or agreed to in writing, software
11936+ * distributed under the License is distributed on an "AS IS" BASIS,
11937+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11938+ * See the License for the specific language governing permissions and
11939+ * limitations under the License.
11940+ */
11941+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_
11942+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_
11943+
11944+#include "nnacl/intrinsics/ms_simd_instructions.h"
11945+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11946+
11947+#ifdef __cplusplus
11948+extern "C" {
11949+#endif
11950+#pragma GCC push_options
11951+#pragma GCC target("sse4.1")
11952+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
11953+#define BLOCK_NUM 4
11954+#define MS_SIMD_SSE
11955+
11956+static inline int DropoutFp32SSE(int index, const float *input, float scale,
11957+    int length, float *output) {
11958+    SIMD_F32 scale_value = SIMD_MOV_F32(scale);
11959+    for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
11960+        SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
11961+    }
11962+    return index;
11963+}
11964+#undef MS_SIMD_INSTRUCTION
11965+#undef BLOCK_NUM
11966+#pragma GCC pop_options
11967+#undef MS_SIMD_SSE
11968+#ifdef __cplusplus
11969+}
11970+#endif
11971+#endif
11972diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
11973new file mode 100644
11974index 00000000..3d802fb3
11975--- /dev/null
11976+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
11977@@ -0,0 +1,63 @@
11978+/**
11979+ * Copyright 2022 Huawei Technologies Co., Ltd
11980+ *
11981+ * Licensed under the Apache License, Version 2.0 (the "License");
11982+ * you may not use this file except in compliance with the License.
11983+ * You may obtain a copy of the License at
11984+ *
11985+ * http://www.apache.org/licenses/LICENSE-2.0
11986+ *
11987+ * Unless required by applicable law or agreed to in writing, software
11988+ * distributed under the License is distributed on an "AS IS" BASIS,
11989+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11990+ * See the License for the specific language governing permissions and
11991+ * limitations under the License.
11992+ */
11993+
11994+#ifndef MINDSPORE_NNACL_FP32_EXP_SSE_H_
11995+#define MINDSPORE_NNACL_FP32_EXP_SSE_H_
11996+
11997+#include "nnacl/intrinsics/ms_simd_instructions.h"
11998+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
11999+
12000+#ifdef __cplusplus
12001+extern "C" {
12002+#endif
12003+#pragma GCC push_options
12004+#pragma GCC target("sse4.1")
12005+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12006+#define BLOCK_NUM 4
12007+#define MS_SIMD_SSE
12008+
12009+static inline int64_t ExpFp32SSE(int64_t index, const float *src, float *dst, int num) {
12010+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12011+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
12012+  }
12013+  return index;
12014+}
12015+
12016+static inline int64_t ExpFp32WithInScaleSSE(int64_t index, const float *src, float *dst, int num, float in_scale) {
12017+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
12018+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12019+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
12020+  }
12021+  return index;
12022+}
12023+
12024+static inline int64_t ExpFp32WithOutScaleSSE(int64_t index, const float *src, float *dst, int num, float out_scale) {
12025+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
12026+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12027+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
12028+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
12029+  }
12030+  return index;
12031+}
12032+
12033+#undef MS_SIMD_INSTRUCTION
12034+#undef BLOCK_NUM
12035+#pragma GCC pop_options
12036+#undef MS_SIMD_SSE
12037+#ifdef __cplusplus
12038+}
12039+#endif
12040+#endif
12041diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
12042new file mode 100644
12043index 00000000..9c71eefb
12044--- /dev/null
12045+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
12046@@ -0,0 +1,53 @@
12047+/**
12048+ * Copyright 2022 Huawei Technologies Co., Ltd
12049+ *
12050+ * Licensed under the Apache License, Version 2.0 (the "License");
12051+ * you may not use this file except in compliance with the License.
12052+ * You may obtain a copy of the License at
12053+ *
12054+ * http://www.apache.org/licenses/LICENSE-2.0
12055+ *
12056+ * Unless required by applicable law or agreed to in writing, software
12057+ * distributed under the License is distributed on an "AS IS" BASIS,
12058+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12059+ * See the License for the specific language governing permissions and
12060+ * limitations under the License.
12061+ */
12062+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_
12063+#define MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_
12064+
12065+#include "nnacl/intrinsics/ms_simd_instructions.h"
12066+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12067+
12068+#ifdef __cplusplus
12069+extern "C" {
12070+#endif
12071+#pragma GCC push_options
12072+#pragma GCC target("sse4.1")
12073+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12074+#define BLOCK_NUM 4
12075+#define MS_SIMD_SSE
12076+
12077+static inline int FillFp32SSE(int index, float *output, int size, float data) {
12078+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12079+    SIMD_ST_F32(output + index, SIMD_MOV_F32(data));
12080+  }
12081+  return index;
12082+}
12083+
12084+static inline int FillInt32SSE(int index, int *output, int size, int data) {
12085+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12086+    SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data));
12087+  }
12088+  return index;
12089+}
12090+
12091+#undef MS_SIMD_INSTRUCTION
12092+#undef BLOCK_NUM
12093+#pragma GCC pop_options
12094+#undef MS_SIMD_SSE
12095+#ifdef __cplusplus
12096+}
12097+#endif
12098+#endif
12099+
12100diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
12101new file mode 100644
12102index 00000000..1c1f57da
12103--- /dev/null
12104+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
12105@@ -0,0 +1,77 @@
12106+/**
12107+ * Copyright 2022 Huawei Technologies Co., Ltd
12108+ *
12109+ * Licensed under the Apache License, Version 2.0 (the "License");
12110+ * you may not use this file except in compliance with the License.
12111+ * You may obtain a copy of the License at
12112+ *
12113+ * http://www.apache.org/licenses/LICENSE-2.0
12114+ *
12115+ * Unless required by applicable law or agreed to in writing, software
12116+ * distributed under the License is distributed on an "AS IS" BASIS,
12117+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12118+ * See the License for the specific language governing permissions and
12119+ * limitations under the License.
12120+ */
12121+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_
12122+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_
12123+
12124+#include "nnacl/intrinsics/ms_simd_instructions.h"
12125+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12126+
12127+#ifdef __cplusplus
12128+extern "C" {
12129+#endif
12130+#pragma GCC push_options
12131+#pragma GCC target("sse4.1")
12132+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12133+#define BLOCK_NUM 4
12134+#define MS_SIMD_SSE
12135+
12136+static inline int64_t GroupNormFp32SSE(int64_t index, const float *unit_input, float scale, float offset, float mean,
12137+  float var_sqrt, int unit, float *unit_output) {
12138+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
12139+  SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt);
12140+  SIMD_F32 scale_val = SIMD_MOV_F32(scale);
12141+  SIMD_F32 offset_val = SIMD_MOV_F32(offset);
12142+  for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12143+    SIMD_F32 input = SIMD_LD_F32(unit_input + index);
12144+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt);
12145+    SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val);
12146+    SIMD_ST_F32(unit_output + index, output);
12147+  }
12148+  return index;
12149+}
12150+
12151+static inline int64_t GroupNormReduceSumSSE(int64_t index, const float *in, float *sum, int unit) {
12152+  if (unit - index >= 4 * BLOCK_NUM) {
12153+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12154+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12155+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index));
12156+    }
12157+    *sum += SIMD_GET_SUM_F32(tmp);
12158+  }
12159+  return index;
12160+}
12161+
12162+static inline int64_t GroupNormReduceVarSSE(int64_t index, const float *in, float mean, float *sum, int unit) {
12163+  if (unit - index >= 4 * BLOCK_NUM) {
12164+    SIMD_F32 mean_val = SIMD_MOV_F32(mean);
12165+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12166+    for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12167+      SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val);
12168+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input));
12169+    }
12170+    *sum += SIMD_GET_SUM_F32(tmp);
12171+  }
12172+  return index;
12173+}
12174+
12175+#undef MS_SIMD_INSTRUCTION
12176+#undef BLOCK_NUM
12177+#pragma GCC pop_options
12178+#undef MS_SIMD_SSE
12179+#ifdef __cplusplus
12180+}
12181+#endif
12182+#endif
12183diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
12184new file mode 100644
12185index 00000000..30af87c3
12186--- /dev/null
12187+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
12188@@ -0,0 +1,68 @@
12189+/**
12190+ * Copyright 2022 Huawei Technologies Co., Ltd
12191+ *
12192+ * Licensed under the Apache License, Version 2.0 (the "License");
12193+ * you may not use this file except in compliance with the License.
12194+ * You may obtain a copy of the License at
12195+ *
12196+ * http://www.apache.org/licenses/LICENSE-2.0
12197+ *
12198+ * Unless required by applicable law or agreed to in writing, software
12199+ * distributed under the License is distributed on an "AS IS" BASIS,
12200+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12201+ * See the License for the specific language governing permissions and
12202+ * limitations under the License.
12203+ */
12204+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_
12205+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_
12206+
12207+#include "nnacl/intrinsics/ms_simd_instructions.h"
12208+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12209+
12210+#ifdef __cplusplus
12211+extern "C" {
12212+#endif
12213+#pragma GCC push_options
12214+#pragma GCC target("sse4.1")
12215+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12216+#define BLOCK_NUM 4
12217+#define MS_SIMD_SSE
12218+
12219+static inline int LayerNormMeanAndSquareSSE(int index, const float *src, int num, float *mean, float *square_mean) {
12220+  if (num >= 4 * BLOCK_NUM) {
12221+    SIMD_F32 sum_val = SIMD_SET0_F32;
12222+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
12223+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12224+      SIMD_F32 value = SIMD_LD_F32(src + index);
12225+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
12226+      sum_val = SIMD_ADD_F32(sum_val, value);
12227+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
12228+    }
12229+    *mean += SIMD_GET_SUM_F32(sum_val);
12230+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
12231+  }
12232+  return index;
12233+}
12234+
12235+static inline int LayerNormGammaAndBetaSSE(int index, float *dst, const float *src, const float *gamma_data,
12236+  const float *beta_data, int num, const float mean, const float deno) {
12237+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
12238+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
12239+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12240+    SIMD_F32 value = SIMD_LD_F32(src + index);
12241+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
12242+    out_value = SIMD_MUL_F32(out_value, deno_val);
12243+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
12244+    SIMD_ST_F32(dst + index, out_value);
12245+  }
12246+  return index;
12247+}
12248+
12249+#undef MS_SIMD_INSTRUCTION
12250+#undef BLOCK_NUM
12251+#pragma GCC pop_options
12252+#undef MS_SIMD_SSE
12253+#ifdef __cplusplus
12254+}
12255+#endif
12256+#endif
12257diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
12258new file mode 100644
12259index 00000000..aef5b2a1
12260--- /dev/null
12261+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
12262@@ -0,0 +1,93 @@
12263+/**
12264+ * Copyright 2022 Huawei Technologies Co., Ltd
12265+ *
12266+ * Licensed under the Apache License, Version 2.0 (the "License");
12267+ * you may not use this file except in compliance with the License.
12268+ * You may obtain a copy of the License at
12269+ *
12270+ * http://www.apache.org/licenses/LICENSE-2.0
12271+ *
12272+ * Unless required by applicable law or agreed to in writing, software
12273+ * distributed under the License is distributed on an "AS IS" BASIS,
12274+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12275+ * See the License for the specific language governing permissions and
12276+ * limitations under the License.
12277+ */
12278+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_
12279+#define MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_
12280+
12281+#include "nnacl/intrinsics/ms_simd_instructions.h"
12282+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12283+
12284+#ifdef __cplusplus
12285+extern "C" {
12286+#endif
12287+#pragma GCC push_options
12288+#pragma GCC target("sse4.1")
12289+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12290+#define BLOCK_NUM 4
12291+#define MS_SIMD_SSE
12292+
12293+// act_type must be 0, 1, 3. 0: no_act, 1: relu, 3: relu6.
12294+static inline int64_t GemmIsNotPackSSE(int64_t index, const float *a, const float *b, float *c, const float *bias, int row,
12295+  int deep, int act_type) {
12296+  SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f);
12297+  SIMD_F32 up_threshold = SIMD_MOV_F32(6);
12298+  SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]);
12299+  SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]);
12300+  for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12301+    SIMD_F32 a_data = SIMD_LD_F32(a + index);
12302+    SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16);
12303+    if (act_type != 0) {
12304+      dst = SIMD_MAX_F32(dst, down_threshold);
12305+      if (act_type == 3) {
12306+        dst = SIMD_MIN_F32(dst, up_threshold);
12307+      }
12308+    }
12309+    SIMD_ST_F32(c + index, dst);
12310+  }
12311+
12312+  return index;
12313+}
12314+
12315+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX)
12316+static inline int64_t GemmIsNotPackOptimizeCoreSSE(int64_t index, const float *a, const float *b, int k, float *dst) {
12317+  SIMD_F32 dst1 = SIMD_MOV_F32(0.0f);
12318+  for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12319+    SIMD_F32 weight = SIMD_LD_F32(b + index);
12320+    SIMD_F32 a1 = SIMD_LD_F32(a + index);
12321+    dst1 = SIMD_FMADD_F32(weight, a1, dst1);
12322+  }
12323+  *dst += SIMD_REDUCE_ADD_F32(dst1);
12324+  return index;
12325+}
12326+#endif
12327+
12328+static inline int64_t MatVecMulNoPackCoreSSE(int64_t oc_index, const float *a, const float *b, float *c, const float *bias,
12329+  int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) {
12330+  for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) {
12331+    SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index));
12332+    for (int64_t k = 0; k < depth; ++k) {
12333+      SIMD_F32 left = SIMD_MOV_F32(a[k]);
12334+      SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col);
12335+      out = SIMD_FMADD_F32(left, right, out);
12336+    }
12337+    if ((inc_flag & 0x2) != 0 && act_type != 0) {
12338+      out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f));
12339+      if (act_type == 0x3) {
12340+        out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f));
12341+      }
12342+    }
12343+    SIMD_ST_F32(c + oc_index, out);
12344+  }
12345+  return oc_index;
12346+}
12347+
12348+#undef MS_SIMD_INSTRUCTION
12349+#undef BLOCK_NUM
12350+#pragma GCC pop_options
12351+#undef MS_SIMD_SSE
12352+#ifdef __cplusplus
12353+}
12354+#endif
12355+#endif
12356diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
12357new file mode 100644
12358index 00000000..e3dd4582
12359--- /dev/null
12360+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
12361@@ -0,0 +1,218 @@
12362+/**
12363+ * Copyright 2022 Huawei Technologies Co., Ltd
12364+ *
12365+ * Licensed under the Apache License, Version 2.0 (the "License");
12366+ * you may not use this file except in compliance with the License.
12367+ * You may obtain a copy of the License at
12368+ *
12369+ * http://www.apache.org/licenses/LICENSE-2.0
12370+ *
12371+ * Unless required by applicable law or agreed to in writing, software
12372+ * distributed under the License is distributed on an "AS IS" BASIS,
12373+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12374+ * See the License for the specific language governing permissions and
12375+ * limitations under the License.
12376+ */
12377+#ifndef MINDSPORE_NNACL_FP32_MUL_F32_SSE_H_
12378+#define MINDSPORE_NNACL_FP32_MUL_F32_SSE_H_
12379+
12380+#include "nnacl/intrinsics/ms_simd_instructions.h"
12381+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12382+
12383+#ifdef __cplusplus
12384+extern "C" {
12385+#endif
12386+#pragma GCC push_options
12387+#pragma GCC target("sse4.1")
12388+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12389+#define BLOCK_NUM 4
12390+#define MS_SIMD_SSE
12391+
12392+static inline int ElementMulSSE(int index, const float *in0, const float *in1, float *out, int size) {
12393+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12394+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12395+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12396+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
12397+    SIMD_ST_F32(out + index, vout);
12398+  }
12399+  return index;
12400+}
12401+
12402+static inline int ElementMulReluSSE(int index, const float *in0, const float *in1, float *out, int size) {
12403+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12404+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12405+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12406+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
12407+    SIMD_ST_F32(out + index, vout);
12408+  }
12409+  return index;
12410+}
12411+
12412+static inline int ElementMulRelu6SSE(int index, const float *in0, const float *in1, float *out, int size) {
12413+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12414+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12415+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12416+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
12417+    SIMD_ST_F32(out + index, vout);
12418+  }
12419+  return index;
12420+}
12421+
12422+static inline int ElementMulIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
12423+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12424+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12425+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12426+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
12427+    SIMD_ST_EPI32(out + index, vout);
12428+  }
12429+  return index;
12430+}
12431+
12432+static inline int ElementMulReluIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
12433+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12434+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12435+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12436+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
12437+    SIMD_ST_EPI32(out + index, vout);
12438+  }
12439+  return index;
12440+}
12441+
12442+static inline int ElementMulRelu6IntSSE(int index, const int *in0, const int *in1, int *out, int size) {
12443+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12444+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12445+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12446+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
12447+    SIMD_ST_EPI32(out + index, vout);
12448+  }
12449+  return index;
12450+}
12451+
12452+static inline int ElementOptMulNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
12453+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
12454+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12455+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12456+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
12457+    SIMD_ST_F32(out + index, vout);
12458+  }
12459+  return index;
12460+}
12461+
12462+static inline int ElementOptMulNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
12463+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
12464+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12465+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12466+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
12467+    SIMD_ST_F32(out + index, vout);
12468+  }
12469+  return index;
12470+}
12471+
12472+static inline int ElementOptMulReluNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
12473+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
12474+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12475+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12476+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
12477+    SIMD_ST_F32(out + index, vout);
12478+  }
12479+  return index;
12480+}
12481+
12482+static inline int ElementOptMulReluNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
12483+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
12484+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12485+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12486+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
12487+    SIMD_ST_F32(out + index, vout);
12488+  }
12489+  return index;
12490+}
12491+
12492+static inline int ElementOptMulRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, int size) {
12493+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
12494+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12495+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
12496+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
12497+    SIMD_ST_F32(out + index, vout);
12498+  }
12499+  return index;
12500+}
12501+
12502+static inline int ElementOptMulRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, int size) {
12503+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
12504+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12505+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
12506+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
12507+    SIMD_ST_F32(out + index, vout);
12508+  }
12509+  return index;
12510+}
12511+
12512+static inline int ElementOptMulIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
12513+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
12514+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12515+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12516+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
12517+    SIMD_ST_EPI32(out + index, vout);
12518+  }
12519+  return index;
12520+}
12521+
12522+static inline int ElementOptMulIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
12523+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
12524+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12525+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12526+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
12527+    SIMD_ST_EPI32(out + index, vout);
12528+  }
12529+  return index;
12530+}
12531+
12532+static inline int ElementOptMulReluIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
12533+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
12534+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12535+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12536+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f);
12537+    SIMD_ST_EPI32(out + index, vout);
12538+  }
12539+  return index;
12540+}
12541+
12542+static inline int ElementOptMulReluIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
12543+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
12544+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12545+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12546+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f);
12547+    SIMD_ST_EPI32(out + index, vout);
12548+  }
12549+  return index;
12550+}
12551+
12552+static inline int ElementOptMulRelu6IntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
12553+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
12554+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12555+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
12556+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f);
12557+    SIMD_ST_EPI32(out + index, vout);
12558+  }
12559+  return index;
12560+}
12561+
12562+static inline int ElementOptMulRelu6IntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
12563+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
12564+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12565+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
12566+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f);
12567+    SIMD_ST_EPI32(out + index, vout);
12568+  }
12569+  return index;
12570+}
12571+
12572+#undef MS_SIMD_INSTRUCTION
12573+#undef BLOCK_NUM
12574+#pragma GCC pop_options
12575+#undef MS_SIMD_SSE
12576+#ifdef __cplusplus
12577+}
12578+#endif
12579+#endif
12580diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
12581new file mode 100644
12582index 00000000..ad9239fd
12583--- /dev/null
12584+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
12585@@ -0,0 +1,84 @@
12586+/**
12587+ * Copyright 2022 Huawei Technologies Co., Ltd
12588+ *
12589+ * Licensed under the Apache License, Version 2.0 (the "License");
12590+ * you may not use this file except in compliance with the License.
12591+ * You may obtain a copy of the License at
12592+ *
12593+ * http://www.apache.org/licenses/LICENSE-2.0
12594+ *
12595+ * Unless required by applicable law or agreed to in writing, software
12596+ * distributed under the License is distributed on an "AS IS" BASIS,
12597+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12598+ * See the License for the specific language governing permissions and
12599+ * limitations under the License.
12600+ */
12601+#ifndef MINDSPORE_NNACL_FP32_POOLING_SSE_H_
12602+#define MINDSPORE_NNACL_FP32_POOLING_SSE_H_
12603+
12604+#include "nnacl/intrinsics/ms_simd_instructions.h"
12605+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12606+
12607+#ifdef __cplusplus
12608+extern "C" {
12609+#endif
12610+#pragma GCC push_options
12611+#pragma GCC target("sse4.1")
12612+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12613+#define BLOCK_NUM 4
12614+#define MS_SIMD_SSE
12615+
12616+static inline int AvgPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel,
12617+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
12618+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
12619+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
12620+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
12621+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
12622+    const float *src_c_ptr = src_plane_ptr + ci;
12623+    float *dst_c_ptr = dst_plane_ptr + ci;
12624+    SIMD_F32 tmp_avg = SIMD_SET0_F32;
12625+    int real_count = 0;
12626+    for (int h = real_win_h_start; h < real_win_h_end; h++) {
12627+      for (int w = real_win_w_start; w < real_win_w_end; w++) {
12628+        const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel;
12629+        tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr));
12630+        ++real_count;
12631+      }
12632+    }
12633+    tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count));
12634+    tmp_avg = SIMD_MAX_F32(tmp_avg, min_val);
12635+    tmp_avg = SIMD_MIN_F32(tmp_avg, max_val);
12636+    SIMD_ST_F32(dst_c_ptr, tmp_avg);
12637+  }
12638+  return ci;
12639+}
12640+
12641+static inline int MaxPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel,
12642+  float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end,
12643+  int in_h_index, int in_w, int in_w_index, float minf, float maxf) {
12644+  SIMD_F32 min_val = SIMD_MOV_F32(minf);
12645+  SIMD_F32 max_val = SIMD_MOV_F32(maxf);
12646+  for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) {
12647+    const float *src_c_ptr = src_plane_ptr + ci;
12648+    float *dst_c_ptr = dst_plane_ptr + ci;
12649+    SIMD_F32 tmp_max = min_val;
12650+    for (int kh = real_win_h_start; kh < real_win_h_end; kh++) {
12651+      for (int kw = real_win_w_start; kw < real_win_w_end; kw++) {
12652+        const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel;
12653+        tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr));
12654+      }
12655+    }
12656+    tmp_max = SIMD_MIN_F32(tmp_max, max_val);
12657+    SIMD_ST_F32(dst_c_ptr, tmp_max);
12658+  }
12659+  return ci;
12660+}
12661+
12662+#undef MS_SIMD_INSTRUCTION
12663+#undef BLOCK_NUM
12664+#pragma GCC pop_options
12665+#undef MS_SIMD_SSE
12666+#ifdef __cplusplus
12667+}
12668+#endif
12669+#endif
12670diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
12671new file mode 100644
12672index 00000000..4c46310e
12673--- /dev/null
12674+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
12675@@ -0,0 +1,101 @@
12676+/**
12677+ * Copyright 2022 Huawei Technologies Co., Ltd
12678+ *
12679+ * Licensed under the Apache License, Version 2.0 (the "License");
12680+ * you may not use this file except in compliance with the License.
12681+ * You may obtain a copy of the License at
12682+ *
12683+ * http://www.apache.org/licenses/LICENSE-2.0
12684+ *
12685+ * Unless required by applicable law or agreed to in writing, software
12686+ * distributed under the License is distributed on an "AS IS" BASIS,
12687+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12688+ * See the License for the specific language governing permissions and
12689+ * limitations under the License.
12690+ */
12691+#ifndef MINDSPORE_NNACL_FP32_POWER_SSE_H_
12692+#define MINDSPORE_NNACL_FP32_POWER_SSE_H_
12693+
12694+#include "nnacl/intrinsics/ms_simd_instructions.h"
12695+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12696+
12697+#ifdef __cplusplus
12698+extern "C" {
12699+#endif
12700+#pragma GCC push_options
12701+#pragma GCC target("sse4.1")
12702+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12703+#define BLOCK_NUM 4
12704+#define MS_SIMD_SSE
12705+
12706+static inline int PowerBroadCastIntExponentSSE(int index, const float *input, int exponent, float *output, int len,
12707+  float scale, float shift) {
12708+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
12709+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
12710+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12711+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
12712+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
12713+    int exp = abs(exponent);
12714+    while (exp) {
12715+      if (exp % 2) {
12716+        result = SIMD_MUL_F32(result, tmp);
12717+      }
12718+      tmp = SIMD_MUL_SQUARE_F32(tmp);
12719+      exp = exp / 2;
12720+    }
12721+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
12722+  }
12723+  return index;
12724+}
12725+
12726+static inline int PowerBroadCastFloatExponentSSE(int index, const float *input, float exponent, float *output, int len,
12727+  float scale, float shift) {
12728+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
12729+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
12730+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12731+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
12732+    SIMD_F32 result;
12733+    for (int i = 0; i < BLOCK_NUM; ++i) {
12734+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
12735+    }
12736+    SIMD_ST_F32(output + index, result);
12737+  }
12738+  return index;
12739+}
12740+
12741+static inline int PowerSingleExponentSSE(int index, const float *input, const float *exponent, float *output, int len,
12742+  float scale, float shift) {
12743+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
12744+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
12745+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12746+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
12747+    for (int j = 0; j < BLOCK_NUM; ++j) {
12748+      float cur_exponent = exponent[index + j];
12749+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
12750+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
12751+        int exp = abs((int)(cur_exponent));
12752+        float result = 1;
12753+        while (exp) {
12754+          if (exp % 2) {
12755+            result *= cur_val;
12756+          }
12757+          cur_val *= cur_val;
12758+          exp = exp / 2;
12759+        }
12760+        output[index + j] = cur_exponent >= 0 ? result : 1 / result;
12761+      } else {
12762+        output[index + j] = powf(cur_val, cur_exponent);
12763+      }
12764+    }
12765+  }
12766+  return index;
12767+}
12768+
12769+#undef MS_SIMD_INSTRUCTION
12770+#undef BLOCK_NUM
12771+#pragma GCC pop_options
12772+#undef MS_SIMD_SSE
12773+#ifdef __cplusplus
12774+}
12775+#endif
12776+#endif
12777diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
12778new file mode 100644
12779index 00000000..936a5d51
12780--- /dev/null
12781+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
12782@@ -0,0 +1,181 @@
12783+/**
12784+ * Copyright 2022 Huawei Technologies Co., Ltd
12785+ *
12786+ * Licensed under the Apache License, Version 2.0 (the "License");
12787+ * you may not use this file except in compliance with the License.
12788+ * You may obtain a copy of the License at
12789+ *
12790+ * http://www.apache.org/licenses/LICENSE-2.0
12791+ *
12792+ * Unless required by applicable law or agreed to in writing, software
12793+ * distributed under the License is distributed on an "AS IS" BASIS,
12794+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12795+ * See the License for the specific language governing permissions and
12796+ * limitations under the License.
12797+ */
12798+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_
12799+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_
12800+
12801+#include "nnacl/intrinsics/ms_simd_instructions.h"
12802+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12803+
12804+#ifdef __cplusplus
12805+extern "C" {
12806+#endif
12807+#pragma GCC push_options
12808+#pragma GCC target("sse4.1")
12809+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12810+#define BLOCK_NUM 4
12811+#define MS_SIMD_SSE
12812+
12813+static inline int64_t ReduceSumSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12814+  int axis_size) {  // sum along the reduced axis for BLOCK_NUM inner lanes at a time; returns first unprocessed index
12815+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12816+    const float *inner_src = outer_src + index;
12817+    SIMD_F32 tmp = SIMD_MOV_F32(0);  // additive identity
12818+    for (int i = 0; i < axis_size; i++) {
12819+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));  // stride inner_size walks the reduced axis
12820+    }
12821+    SIMD_ST_F32(outer_dst + index, tmp);
12822+  }
12823+  return index;
12824+}
12825+
12826+static inline int64_t ReduceMeanSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12827+  int axis_size) {  // mean along the reduced axis: vector sum then divide by axis_size
12828+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12829+    const float *inner_src = outer_src + index;
12830+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12831+    for (int i = 0; i < axis_size; i++) {
12832+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12833+    }
12834+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));  // assumes axis_size > 0 — caller's contract
12835+  }
12836+  return index;
12837+}
12838+
12839+static inline int64_t ReduceMinSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12840+  int axis_size) {  // min along the reduced axis for BLOCK_NUM inner lanes at a time
12841+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12842+    const float *inner_src = outer_src + index;
12843+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);  // FLT_MAX is the correct identity for a min-reduce
12844+    for (int i = 0; i < axis_size; i++) {
12845+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12846+    }
12847+    SIMD_ST_F32(outer_dst + index, tmp);
12848+  }
12849+  return index;
12850+}
12851+
12852+static inline int64_t ReduceMaxSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12853+  int axis_size) {  // max along the reduced axis for BLOCK_NUM inner lanes at a time; returns first unprocessed index
12854+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12855+    const float *inner_src = outer_src + index;
12856+    SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX);  // fix: FLT_MIN is the smallest POSITIVE float and corrupts all-negative inputs; max identity is -FLT_MAX
12857+    for (int i = 0; i < axis_size; i++) {
12858+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12859+    }
12860+    SIMD_ST_F32(outer_dst + index, tmp);
12861+  }
12862+  return index;
12863+}
12864+
12865+static inline int64_t ReduceProdSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12866+  int axis_size) {  // product along the reduced axis for BLOCK_NUM inner lanes at a time
12867+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12868+    const float *inner_src = outer_src + index;
12869+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);  // multiplicative identity
12870+    for (int i = 0; i < axis_size; i++) {
12871+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
12872+    }
12873+    SIMD_ST_F32(outer_dst + index, tmp);
12874+  }
12875+  return index;
12876+}
12877+
12878+static inline int64_t ReduceSumSquareSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12879+  int axis_size) {  // sum of squared elements along the reduced axis
12880+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12881+    const float *inner_src = outer_src + index;
12882+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12883+    for (int i = 0; i < axis_size; i++) {
12884+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
12885+    }
12886+    SIMD_ST_F32(outer_dst + index, tmp);
12887+  }
12888+  return index;
12889+}
12890+
12891+static inline int64_t ReduceL2NormSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
12892+  int axis_size) {  // L2 norm along the reduced axis: sqrt(sum of squares)
12893+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12894+    const float *inner_src = outer_src + index;
12895+    SIMD_F32 tmp = SIMD_MOV_F32(0);
12896+    for (int i = 0; i < axis_size; i++) {
12897+      tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size)));
12898+    }
12899+    SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp));  // only this final sqrt differs from ReduceSumSquareSSE
12900+  }
12901+  return index;
12902+}
12903+
12904+static inline int64_t IntReduceSumSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12905+  int axis_size) {  // int32 sum along the reduced axis; no overflow guard (matches scalar path)
12906+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12907+    const int *inner_src = outer_src + index;
12908+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
12909+    for (int i = 0; i < axis_size; i++) {
12910+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12911+    }
12912+    SIMD_ST_EPI32(outer_dst + index, tmp);
12913+  }
12914+  return index;
12915+}
12916+
12917+static inline int64_t IntReduceMeanSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12918+  int axis_size) {  // int32 mean along the reduced axis (integer division truncates)
12919+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12920+    const int *inner_src = outer_src + index;
12921+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(0);
12922+    for (int i = 0; i < axis_size; i++) {
12923+      tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12924+    }
12925+    SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size));  // assumes axis_size > 0 — caller's contract
12926+  }
12927+  return index;
12928+}
12929+
12930+static inline int64_t IntReduceMinSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12931+  int axis_size) {  // int32 min along the reduced axis
12932+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12933+    const int *inner_src = outer_src + index;
12934+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX);  // correct identity for a min-reduce
12935+    for (int i = 0; i < axis_size; i++) {
12936+      tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12937+    }
12938+    SIMD_ST_EPI32(outer_dst + index, tmp);
12939+  }
12940+  return index;
12941+}
12942+
12943+static inline int64_t IntReduceMaxSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size,
12944+  int axis_size) {  // int32 max along the reduced axis
12945+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
12946+    const int *inner_src = outer_src + index;
12947+    SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN);  // correct identity for a max-reduce (contrast the float ReduceMaxSSE)
12948+    for (int i = 0; i < axis_size; i++) {
12949+      tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size));
12950+    }
12951+    SIMD_ST_EPI32(outer_dst + index, tmp);
12952+  }
12953+  return index;
12954+}
12955+
12956+#undef MS_SIMD_INSTRUCTION
12957+#undef BLOCK_NUM
12958+#pragma GCC pop_options
12959+#undef MS_SIMD_SSE
12960+#ifdef __cplusplus
12961+}
12962+#endif
12963+#endif
12964diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
12965new file mode 100644
12966index 00000000..71c89ebc
12967--- /dev/null
12968+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
12969@@ -0,0 +1,87 @@
12970+/**
12971+ * Copyright 2022 Huawei Technologies Co., Ltd
12972+ *
12973+ * Licensed under the Apache License, Version 2.0 (the "License");
12974+ * you may not use this file except in compliance with the License.
12975+ * You may obtain a copy of the License at
12976+ *
12977+ * http://www.apache.org/licenses/LICENSE-2.0
12978+ *
12979+ * Unless required by applicable law or agreed to in writing, software
12980+ * distributed under the License is distributed on an "AS IS" BASIS,
12981+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12982+ * See the License for the specific language governing permissions and
12983+ * limitations under the License.
12984+ */
12985+
12986+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_
12987+#define MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_
12988+
12989+#include "nnacl/intrinsics/ms_simd_instructions.h"
12990+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
12991+
12992+#ifdef __cplusplus
12993+extern "C" {
12994+#endif
12995+#pragma GCC push_options
12996+#pragma GCC target("sse4.1")
12997+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
12998+#define BLOCK_NUM 4
12999+#define MS_SIMD_SSE
13000+
13001+static inline int64_t SoftmaxNormGetMaxSSE(int64_t index, const float *src, int cur_batch_offset,
13002+  float *max, int channel) {  // folds the channel max into *max; returns first unprocessed index
13003+  if (channel >= BLOCK_NUM * BLOCK_NUM) {  // vectorize only for >= 16 channels; otherwise leave all work to the scalar path
13004+    SIMD_F32 max_val = SIMD_MOV_F32(*max);  // seed from caller's running max so partial results combine correctly
13005+    for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13006+      max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index));
13007+    }
13008+    *max = SIMD_GET_MAX_F32(max_val);  // horizontal max across lanes
13009+  }
13010+  return index;
13011+}
13012+
13013+static inline int64_t SoftmaxNormCalcNormSSE(int64_t index, const float *src, float *dst,
13014+  int cur_batch_offset, float max, int channel) {  // dst[i] = src[i] - max (softmax numerical-stability shift)
13015+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13016+    SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max));
13017+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
13018+  }
13019+  return index;
13020+}
13021+
13022+static inline int64_t SoftmaxLastAxisGetExpSumSSE(int64_t index, const float *src, float *dst,
13023+  int cur_batch_offset, float max, float *exp_sum, int channel) {  // dst[i] = exp(src[i] - max); accumulates sum of exps into *exp_sum
13024+#ifndef _WIN32
13025+  SIMD_F32 sum_val = SIMD_SET0_F32;
13026+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13027+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
13028+    SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max));
13029+    SIMD_F32 exp_out = SIMD_EXP_F32(output);
13030+    sum_val = SIMD_ADD_F32(sum_val, exp_out);
13031+    SIMD_ST_F32(dst + cur_batch_offset + index, exp_out);
13032+  }
13033+  *exp_sum += SIMD_GET_SUM_F32(sum_val);  // horizontal add across lanes, folded into the caller's running sum
13034+#endif
13035+  return index;
13036+}
13037+
13038+static inline int64_t SoftmaxLastAxisGetResultSSE(int64_t index, const float *src, float *dst,
13039+  int cur_batch_offset, float exp_sum, int channel) {  // dst[i] = src[i] * exp_sum; NOTE(review): multiplies, so exp_sum is presumably the caller-precomputed 1/sum — confirm against caller
13040+  SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum);
13041+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13042+    SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index);
13043+    SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val);
13044+    SIMD_ST_F32(dst + cur_batch_offset + index, output);
13045+  }
13046+  return index;
13047+}
13048+
13049+#undef MS_SIMD_INSTRUCTION
13050+#undef BLOCK_NUM
13051+#pragma GCC pop_options
13052+#undef MS_SIMD_SSE
13053+#ifdef __cplusplus
13054+};
13055+#endif
13056+#endif
13057diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
13058new file mode 100644
13059index 00000000..a6197e19
13060--- /dev/null
13061+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
13062@@ -0,0 +1,167 @@
13063+/**
13064+ * Copyright 2022 Huawei Technologies Co., Ltd
13065+ *
13066+ * Licensed under the Apache License, Version 2.0 (the "License");
13067+ * you may not use this file except in compliance with the License.
13068+ * You may obtain a copy of the License at
13069+ *
13070+ * http://www.apache.org/licenses/LICENSE-2.0
13071+ *
13072+ * Unless required by applicable law or agreed to in writing, software
13073+ * distributed under the License is distributed on an "AS IS" BASIS,
13074+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13075+ * See the License for the specific language governing permissions and
13076+ * limitations under the License.
13077+ */
13078+
13079+#ifndef MINDSPORE_NNACL_FP32_SUB_SSE_H_
13080+#define MINDSPORE_NNACL_FP32_SUB_SSE_H_
13081+
13082+#include "nnacl/intrinsics/ms_simd_instructions.h"
13083+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
13084+
13085+#ifdef __cplusplus
13086+extern "C" {
13087+#endif
13088+#pragma GCC push_options
13089+#pragma GCC target("sse4.1")
13090+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
13091+#define BLOCK_NUM 4
13092+#define MS_SIMD_SSE
13093+
13094+static inline int ElementOptSubNum0SSE(int index, const float *in0, const float *in1, float *out,
13095+                                                      int size) {  // out[i] = in0[0] - in1[i] (scalar lhs broadcast)
13096+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
13097+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13098+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13099+    SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1);
13100+    SIMD_ST_F32(out + index, vout);
13101+  }
13102+  return index;
13103+}
13104+
13105+static inline int ElementOptSubNum1SSE(int index, const float *in0, const float *in1, float *out,
13106+                                                      int size) {  // out[i] = in0[i] - in1[0] (scalar rhs broadcast)
13107+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
13108+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13109+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13110+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_);
13111+    SIMD_ST_F32(out + index, vout);
13112+  }
13113+  return index;
13114+}
13115+
13116+static inline int ElementOptSubIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {  // int32: out[i] = in0[0] - in1[i]
13117+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
13118+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13119+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
13120+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1);
13121+    SIMD_ST_EPI32(out + index, vout);
13122+  }
13123+  return index;
13124+}
13125+
13126+static inline int ElementOptSubIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {  // int32: out[i] = in0[i] - in1[0]
13127+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
13128+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13129+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
13130+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_);
13131+    SIMD_ST_EPI32(out + index, vout);
13132+  }
13133+  return index;
13134+}
13135+
13136+static inline int ElementOptSubReluNum0SSE(int index, const float *in0, const float *in1, float *out,
13137+                                                          int size) {  // out[i] = max(in0[0] - in1[i], 0) — fused sub + ReLU
13138+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
13139+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13140+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13141+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f);
13142+    SIMD_ST_F32(out + index, vout);
13143+  }
13144+  return index;
13145+}
13146+
13147+static inline int ElementOptSubReluNum1SSE(int index, const float *in0, const float *in1, float *out,
13148+                                                          int size) {  // out[i] = max(in0[i] - in1[0], 0) — fused sub + ReLU
13149+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
13150+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13151+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13152+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f);
13153+    SIMD_ST_F32(out + index, vout);
13154+  }
13155+  return index;
13156+}
13157+
13158+static inline int ElementOptSubRelu6Num0SSE(int index, const float *in0, const float *in1, float *out,
13159+                                                           int size) {  // out[i] = clamp(in0[0] - in1[i], 0, 6) — fused sub + ReLU6
13160+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
13161+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13162+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13163+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f);
13164+    SIMD_ST_F32(out + index, vout);
13165+  }
13166+  return index;
13167+}
13168+
13169+static inline int ElementOptSubRelu6Num1SSE(int index, const float *in0, const float *in1, float *out,
13170+                                                           int size) {  // out[i] = clamp(in0[i] - in1[0], 0, 6) — fused sub + ReLU6
13171+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
13172+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13173+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13174+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f);
13175+    SIMD_ST_F32(out + index, vout);
13176+  }
13177+  return index;
13178+}
13179+
13180+static inline int ElementSubSSE(int index, const float *in0, const float *in1, float *out, int size) {  // out[i] = in0[i] - in1[i], elementwise
13181+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13182+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13183+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13184+    SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1);
13185+    SIMD_ST_F32(out + index, vout);
13186+  }
13187+  return index;
13188+}
13189+
13190+static inline int ElementSubIntSSE(int index, const int *in0, const int *in1, int *out, int size) {  // int32: out[i] = in0[i] - in1[i], elementwise
13191+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13192+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
13193+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
13194+    SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1);
13195+    SIMD_ST_EPI32(out + index, vout);
13196+  }
13197+  return index;
13198+}
13199+
13200+static inline int ElementSubReluSSE(int index, const float *in0, const float *in1, float *out,
13201+                                                   int size) {  // out[i] = max(in0[i] - in1[i], 0) — fused sub + ReLU
13202+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13203+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13204+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13205+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f);
13206+    SIMD_ST_F32(out + index, vout);
13207+  }
13208+  return index;
13209+}
13210+
13211+static inline int ElementSubRelu6SSE(int index, const float *in0, const float *in1, float *out,
13212+                                                    int size) {  // out[i] = clamp(in0[i] - in1[i], 0, 6) — fused sub + ReLU6
13213+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
13214+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
13215+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
13216+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f);
13217+    SIMD_ST_F32(out + index, vout);
13218+  }
13219+  return index;
13220+}
13221+
13222+#undef MS_SIMD_INSTRUCTION
13223+#undef BLOCK_NUM
13224+#pragma GCC pop_options
13225+#undef MS_SIMD_SSE
13226+#ifdef __cplusplus
13227+};
13228+#endif
13229+#endif
13230diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h
13231new file mode 100644
13232index 00000000..894f5d7c
13233--- /dev/null
13234+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h
13235@@ -0,0 +1,36 @@
13236+/**
13237+ * Copyright 2022 Huawei Technologies Co., Ltd
13238+ *
13239+ * Licensed under the Apache License, Version 2.0 (the "License");
13240+ * you may not use this file except in compliance with the License.
13241+ * You may obtain a copy of the License at
13242+ *
13243+ * http://www.apache.org/licenses/LICENSE-2.0
13244+ *
13245+ * Unless required by applicable law or agreed to in writing, software
13246+ * distributed under the License is distributed on an "AS IS" BASIS,
13247+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13248+ * See the License for the specific language governing permissions and
13249+ * limitations under the License.
13250+ */
13251+#ifndef MINDSPORE_NNACL_SUB_FP32_SIMD_H_
13252+#define MINDSPORE_NNACL_SUB_FP32_SIMD_H_
13253+
13254+#include "nnacl/intrinsics/ms_simd_instructions.h"
13255+#ifdef ENABLE_AVX512
13256+#include "nnacl/avx512/sub_fp32_avx512.h"
13257+#endif
13258+
13259+#ifdef ENABLE_AVX
13260+#include "nnacl/avx/sub_fp32_avx.h"
13261+#endif
13262+
13263+#ifdef ENABLE_SSE
13264+#include "nnacl/sse/sub_fp32_sse.h"
13265+#endif
13266+
13267+#ifdef ENABLE_ARM
13268+#include "nnacl/neon/sub_fp32_neon.h"
13269+#endif
13270+
13271+#endif
13272--
132732.34.1
13274
13275