1From e6e35ad9f7b4c0c99d2f9b62c7d199dd3bf487dc Mon Sep 17 00:00:00 2001 2From: Zhu Guodong <zhuguodong0001@163.com> 3Date: Mon, 6 Mar 2023 16:02:57 +0800 4Subject: [PATCH 2/4] generate nnacl simd headers manually 5 6--- 7 .../include/nnacl/activation_fp32_simd.h | 36 +++ 8 .../include/nnacl/activation_grad_simd.h | 36 +++ 9 .../nnacl/include/nnacl/adam_fp32_simd.h | 36 +++ 10 .../nnacl/include/nnacl/add_fp32_simd.h | 36 +++ 11 .../include/nnacl/arithmetic_fp32_simd.h | 36 +++ 12 .../include/nnacl/arithmetic_self_fp32_simd.h | 36 +++ 13 .../include/nnacl/avx/activation_fp32_avx.h | 221 +++++++++++++++ 14 .../include/nnacl/avx/activation_grad_avx.h | 57 ++++ 15 .../nnacl/include/nnacl/avx/adam_fp32_avx.h | 210 +++++++++++++++ 16 .../nnacl/include/nnacl/avx/add_fp32_avx.h | 124 +++++++++ 17 .../include/nnacl/avx/arithmetic_fp32_avx.h | 254 ++++++++++++++++++ 18 .../nnacl/avx/arithmetic_self_fp32_avx.h | 129 +++++++++ 19 .../include/nnacl/avx/batchnorm_fp32_avx.h | 67 +++++ 20 .../nnacl/avx/bce_with_logits_loss_fp32_avx.h | 69 +++++ 21 .../nnacl/include/nnacl/avx/bias_add_avx.h | 64 +++++ 22 .../nnacl/include/nnacl/avx/cast_base_avx.h | 56 ++++ 23 .../nnacl/include/nnacl/avx/cdist_fp32_avx.h | 70 +++++ 24 .../nnacl/include/nnacl/avx/cumsum_fp32_avx.h | 121 +++++++++ 25 .../nnacl/include/nnacl/avx/div_fp32_avx.h | 167 ++++++++++++ 26 .../include/nnacl/avx/dropout_fp32_avx.h | 46 ++++ 27 .../nnacl/include/nnacl/avx/exp_fp32_avx.h | 63 +++++ 28 .../nnacl/include/nnacl/avx/fill_base_avx.h | 53 ++++ 29 .../include/nnacl/avx/group_norm_fp32_avx.h | 77 ++++++ 30 .../include/nnacl/avx/layer_norm_fp32_avx.h | 68 +++++ 31 .../nnacl/include/nnacl/avx/matmul_fp32_avx.h | 93 +++++++ 32 .../nnacl/include/nnacl/avx/mul_fp32_avx.h | 218 +++++++++++++++ 33 .../include/nnacl/avx/pooling_fp32_avx.h | 84 ++++++ 34 .../nnacl/include/nnacl/avx/power_fp32_avx.h | 101 +++++++ 35 .../nnacl/include/nnacl/avx/reduce_fp32_avx.h | 181 +++++++++++++ 36 .../include/nnacl/avx/softmax_fp32_avx.h | 87 
++++++ 37 .../nnacl/include/nnacl/avx/sub_fp32_avx.h | 167 ++++++++++++ 38 .../nnacl/avx512/activation_fp32_avx512.h | 221 +++++++++++++++ 39 .../nnacl/avx512/activation_grad_avx512.h | 57 ++++ 40 .../include/nnacl/avx512/adam_fp32_avx512.h | 210 +++++++++++++++ 41 .../include/nnacl/avx512/add_fp32_avx512.h | 124 +++++++++ 42 .../nnacl/avx512/arithmetic_fp32_avx512.h | 254 ++++++++++++++++++ 43 .../avx512/arithmetic_self_fp32_avx512.h | 129 +++++++++ 44 .../nnacl/avx512/batchnorm_fp32_avx512.h | 67 +++++ 45 .../avx512/bce_with_logits_loss_fp32_avx512.h | 69 +++++ 46 .../include/nnacl/avx512/bias_add_avx512.h | 64 +++++ 47 .../include/nnacl/avx512/cast_base_avx512.h | 56 ++++ 48 .../include/nnacl/avx512/cdist_fp32_avx512.h | 70 +++++ 49 .../include/nnacl/avx512/cumsum_fp32_avx512.h | 121 +++++++++ 50 .../include/nnacl/avx512/div_fp32_avx512.h | 167 ++++++++++++ 51 .../nnacl/avx512/dropout_fp32_avx512.h | 46 ++++ 52 .../include/nnacl/avx512/exp_fp32_avx512.h | 63 +++++ 53 .../include/nnacl/avx512/fill_base_avx512.h | 53 ++++ 54 .../nnacl/avx512/group_norm_fp32_avx512.h | 77 ++++++ 55 .../nnacl/avx512/layer_norm_fp32_avx512.h | 68 +++++ 56 .../include/nnacl/avx512/matmul_fp32_avx512.h | 93 +++++++ 57 .../include/nnacl/avx512/mul_fp32_avx512.h | 218 +++++++++++++++ 58 .../nnacl/avx512/pooling_fp32_avx512.h | 84 ++++++ 59 .../include/nnacl/avx512/power_fp32_avx512.h | 101 +++++++ 60 .../include/nnacl/avx512/reduce_fp32_avx512.h | 181 +++++++++++++ 61 .../nnacl/avx512/softmax_fp32_avx512.h | 87 ++++++ 62 .../include/nnacl/avx512/sub_fp32_avx512.h | 167 ++++++++++++ 63 .../nnacl/include/nnacl/batchnorm_fp32_simd.h | 36 +++ 64 .../nnacl/bce_with_logits_loss_fp32_simd.h | 36 +++ 65 .../nnacl/include/nnacl/bias_add_simd.h | 36 +++ 66 .../nnacl/include/nnacl/cast_base_simd.h | 36 +++ 67 .../nnacl/include/nnacl/cdist_fp32_simd.h | 36 +++ 68 .../nnacl/include/nnacl/cumsum_fp32_simd.h | 36 +++ 69 .../nnacl/include/nnacl/div_fp32_simd.h | 36 +++ 70 
.../nnacl/include/nnacl/dropout_fp32_simd.h | 36 +++ 71 .../nnacl/include/nnacl/exp_fp32_simd.h | 36 +++ 72 .../nnacl/include/nnacl/fill_base_simd.h | 36 +++ 73 .../include/nnacl/group_norm_fp32_simd.h | 36 +++ 74 .../include/nnacl/layer_norm_fp32_simd.h | 36 +++ 75 .../nnacl/include/nnacl/matmul_fp32_simd.h | 36 +++ 76 .../nnacl/include/nnacl/mul_fp32_simd.h | 36 +++ 77 .../include/nnacl/neon/activation_fp32_neon.h | 220 +++++++++++++++ 78 .../include/nnacl/neon/activation_grad_neon.h | 56 ++++ 79 .../nnacl/include/nnacl/neon/adam_fp32_neon.h | 209 ++++++++++++++ 80 .../nnacl/include/nnacl/neon/add_fp32_neon.h | 123 +++++++++ 81 .../include/nnacl/neon/arithmetic_fp32_neon.h | 253 +++++++++++++++++ 82 .../nnacl/neon/arithmetic_self_fp32_neon.h | 128 +++++++++ 83 .../include/nnacl/neon/batchnorm_fp32_neon.h | 66 +++++ 84 .../neon/bce_with_logits_loss_fp32_neon.h | 68 +++++ 85 .../nnacl/include/nnacl/neon/bias_add_neon.h | 63 +++++ 86 .../nnacl/include/nnacl/neon/cast_base_neon.h | 55 ++++ 87 .../include/nnacl/neon/cdist_fp32_neon.h | 69 +++++ 88 .../include/nnacl/neon/cumsum_fp32_neon.h | 120 +++++++++ 89 .../nnacl/include/nnacl/neon/div_fp32_neon.h | 166 ++++++++++++ 90 .../include/nnacl/neon/dropout_fp32_neon.h | 45 ++++ 91 .../nnacl/include/nnacl/neon/exp_fp32_neon.h | 62 +++++ 92 .../nnacl/include/nnacl/neon/fill_base_neon.h | 52 ++++ 93 .../include/nnacl/neon/group_norm_fp32_neon.h | 76 ++++++ 94 .../include/nnacl/neon/layer_norm_fp32_neon.h | 67 +++++ 95 .../include/nnacl/neon/matmul_fp32_neon.h | 92 +++++++ 96 .../nnacl/include/nnacl/neon/mul_fp32_neon.h | 217 +++++++++++++++ 97 .../include/nnacl/neon/pooling_fp32_neon.h | 83 ++++++ 98 .../include/nnacl/neon/power_fp32_neon.h | 100 +++++++ 99 .../include/nnacl/neon/reduce_fp32_neon.h | 180 +++++++++++++ 100 .../include/nnacl/neon/softmax_fp32_neon.h | 86 ++++++ 101 .../nnacl/include/nnacl/neon/sub_fp32_neon.h | 166 ++++++++++++ 102 .../nnacl/include/nnacl/pooling_fp32_simd.h | 36 +++ 103 
.../nnacl/include/nnacl/power_fp32_simd.h | 36 +++ 104 .../nnacl/include/nnacl/reduce_fp32_simd.h | 36 +++ 105 .../nnacl/include/nnacl/softmax_fp32_simd.h | 36 +++ 106 .../include/nnacl/sse/activation_fp32_sse.h | 221 +++++++++++++++ 107 .../include/nnacl/sse/activation_grad_sse.h | 57 ++++ 108 .../nnacl/include/nnacl/sse/adam_fp32_sse.h | 210 +++++++++++++++ 109 .../nnacl/include/nnacl/sse/add_fp32_sse.h | 124 +++++++++ 110 .../include/nnacl/sse/arithmetic_fp32_sse.h | 254 ++++++++++++++++++ 111 .../nnacl/sse/arithmetic_self_fp32_sse.h | 129 +++++++++ 112 .../include/nnacl/sse/batchnorm_fp32_sse.h | 67 +++++ 113 .../nnacl/sse/bce_with_logits_loss_fp32_sse.h | 69 +++++ 114 .../nnacl/include/nnacl/sse/bias_add_sse.h | 64 +++++ 115 .../nnacl/include/nnacl/sse/cast_base_sse.h | 56 ++++ 116 .../nnacl/include/nnacl/sse/cdist_fp32_sse.h | 70 +++++ 117 .../nnacl/include/nnacl/sse/cumsum_fp32_sse.h | 121 +++++++++ 118 .../nnacl/include/nnacl/sse/div_fp32_sse.h | 167 ++++++++++++ 119 .../include/nnacl/sse/dropout_fp32_sse.h | 46 ++++ 120 .../nnacl/include/nnacl/sse/exp_fp32_sse.h | 63 +++++ 121 .../nnacl/include/nnacl/sse/fill_base_sse.h | 53 ++++ 122 .../include/nnacl/sse/group_norm_fp32_sse.h | 77 ++++++ 123 .../include/nnacl/sse/layer_norm_fp32_sse.h | 68 +++++ 124 .../nnacl/include/nnacl/sse/matmul_fp32_sse.h | 93 +++++++ 125 .../nnacl/include/nnacl/sse/mul_fp32_sse.h | 218 +++++++++++++++ 126 .../include/nnacl/sse/pooling_fp32_sse.h | 84 ++++++ 127 .../nnacl/include/nnacl/sse/power_fp32_sse.h | 101 +++++++ 128 .../nnacl/include/nnacl/sse/reduce_fp32_sse.h | 181 +++++++++++++ 129 .../include/nnacl/sse/softmax_fp32_sse.h | 87 ++++++ 130 .../nnacl/include/nnacl/sse/sub_fp32_sse.h | 167 ++++++++++++ 131 .../nnacl/include/nnacl/sub_fp32_simd.h | 36 +++ 132 125 files changed, 12263 insertions(+) 133 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h 134 create mode 100644 
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h 135 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h 136 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h 137 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h 138 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h 139 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h 140 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h 141 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h 142 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h 143 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h 144 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h 145 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h 146 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h 147 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h 148 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h 149 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h 150 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h 151 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h 152 create mode 100644 
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h 153 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h 154 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h 155 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h 156 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h 157 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h 158 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h 159 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h 160 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h 161 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h 162 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h 163 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h 164 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h 165 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h 166 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h 167 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h 168 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h 169 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h 170 create mode 100644 
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h 171 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h 172 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h 173 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h 174 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h 175 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h 176 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h 177 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h 178 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h 179 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h 180 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h 181 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h 182 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h 183 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h 184 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h 185 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h 186 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h 187 create mode 100644 
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h 188 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h 189 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h 190 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h 191 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h 192 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h 193 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h 194 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h 195 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h 196 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h 197 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h 198 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h 199 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h 200 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h 201 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h 202 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h 203 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h 204 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h 205 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h 206 create mode 
100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h 207 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h 208 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h 209 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h 210 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h 211 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h 212 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h 213 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h 214 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h 215 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h 216 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h 217 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h 218 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h 219 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h 220 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h 221 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h 222 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h 223 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h 224 create mode 100644 
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h 225 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h 226 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h 227 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h 228 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h 229 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h 230 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h 231 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h 232 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h 233 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h 234 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h 235 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h 236 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h 237 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h 238 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h 239 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h 240 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h 241 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h 242 create mode 100644 
mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h 243 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h 244 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h 245 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h 246 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h 247 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h 248 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h 249 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h 250 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h 251 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h 252 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h 253 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h 254 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h 255 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h 256 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h 257 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h 258 259diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h 260new file mode 100644 261index 00000000..fead4fd3 262--- /dev/null 263+++ 
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h 264@@ -0,0 +1,36 @@ 265+/** 266+ * Copyright 2022 Huawei Technologies Co., Ltd 267+ * 268+ * Licensed under the Apache License, Version 2.0 (the "License"); 269+ * you may not use this file except in compliance with the License. 270+ * You may obtain a copy of the License at 271+ * 272+ * http://www.apache.org/licenses/LICENSE-2.0 273+ * 274+ * Unless required by applicable law or agreed to in writing, software 275+ * distributed under the License is distributed on an "AS IS" BASIS, 276+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 277+ * See the License for the specific language governing permissions and 278+ * limitations under the License. 279+ */ 280+#ifndef MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_ 281+#define MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_ 282+ 283+#include "nnacl/intrinsics/ms_simd_instructions.h" 284+#ifdef ENABLE_AVX512 285+#include "nnacl/avx512/activation_fp32_avx512.h" 286+#endif 287+ 288+#ifdef ENABLE_AVX 289+#include "nnacl/avx/activation_fp32_avx.h" 290+#endif 291+ 292+#ifdef ENABLE_SSE 293+#include "nnacl/sse/activation_fp32_sse.h" 294+#endif 295+ 296+#ifdef ENABLE_ARM 297+#include "nnacl/neon/activation_fp32_neon.h" 298+#endif 299+ 300+#endif 301diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h 302new file mode 100644 303index 00000000..c8637379 304--- /dev/null 305+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h 306@@ -0,0 +1,36 @@ 307+/** 308+ * Copyright 2022 Huawei Technologies Co., Ltd 309+ * 310+ * Licensed under the Apache License, Version 2.0 (the "License"); 311+ * you may not use this file except in compliance with the License. 
312+ * You may obtain a copy of the License at 313+ * 314+ * http://www.apache.org/licenses/LICENSE-2.0 315+ * 316+ * Unless required by applicable law or agreed to in writing, software 317+ * distributed under the License is distributed on an "AS IS" BASIS, 318+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 319+ * See the License for the specific language governing permissions and 320+ * limitations under the License. 321+ */ 322+#ifndef MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_ 323+#define MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_ 324+ 325+#include "nnacl/intrinsics/ms_simd_instructions.h" 326+#ifdef ENABLE_AVX512 327+#include "nnacl/avx512/activation_grad_avx512.h" 328+#endif 329+ 330+#ifdef ENABLE_AVX 331+#include "nnacl/avx/activation_grad_avx.h" 332+#endif 333+ 334+#ifdef ENABLE_SSE 335+#include "nnacl/sse/activation_grad_sse.h" 336+#endif 337+ 338+#ifdef ENABLE_ARM 339+#include "nnacl/neon/activation_grad_neon.h" 340+#endif 341+ 342+#endif 343diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h 344new file mode 100644 345index 00000000..267799ed 346--- /dev/null 347+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h 348@@ -0,0 +1,36 @@ 349+/** 350+ * Copyright 2022 Huawei Technologies Co., Ltd 351+ * 352+ * Licensed under the Apache License, Version 2.0 (the "License"); 353+ * you may not use this file except in compliance with the License. 354+ * You may obtain a copy of the License at 355+ * 356+ * http://www.apache.org/licenses/LICENSE-2.0 357+ * 358+ * Unless required by applicable law or agreed to in writing, software 359+ * distributed under the License is distributed on an "AS IS" BASIS, 360+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 361+ * See the License for the specific language governing permissions and 362+ * limitations under the License. 
363+ */ 364+#ifndef MINDSPORE_NNACL_ADAM_FP32_SIMD_H_ 365+#define MINDSPORE_NNACL_ADAM_FP32_SIMD_H_ 366+ 367+#include "nnacl/intrinsics/ms_simd_instructions.h" 368+#ifdef ENABLE_AVX512 369+#include "nnacl/avx512/adam_fp32_avx512.h" 370+#endif 371+ 372+#ifdef ENABLE_AVX 373+#include "nnacl/avx/adam_fp32_avx.h" 374+#endif 375+ 376+#ifdef ENABLE_SSE 377+#include "nnacl/sse/adam_fp32_sse.h" 378+#endif 379+ 380+#ifdef ENABLE_ARM 381+#include "nnacl/neon/adam_fp32_neon.h" 382+#endif 383+ 384+#endif 385diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h 386new file mode 100644 387index 00000000..83cd76ec 388--- /dev/null 389+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h 390@@ -0,0 +1,36 @@ 391+/** 392+ * Copyright 2022 Huawei Technologies Co., Ltd 393+ * 394+ * Licensed under the Apache License, Version 2.0 (the "License"); 395+ * you may not use this file except in compliance with the License. 396+ * You may obtain a copy of the License at 397+ * 398+ * http://www.apache.org/licenses/LICENSE-2.0 399+ * 400+ * Unless required by applicable law or agreed to in writing, software 401+ * distributed under the License is distributed on an "AS IS" BASIS, 402+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 403+ * See the License for the specific language governing permissions and 404+ * limitations under the License. 
405+ */ 406+#ifndef MINDSPORE_NNACL_ADD_FP32_SIMD_H_ 407+#define MINDSPORE_NNACL_ADD_FP32_SIMD_H_ 408+ 409+#include "nnacl/intrinsics/ms_simd_instructions.h" 410+#ifdef ENABLE_AVX512 411+#include "nnacl/avx512/add_fp32_avx512.h" 412+#endif 413+ 414+#ifdef ENABLE_AVX 415+#include "nnacl/avx/add_fp32_avx.h" 416+#endif 417+ 418+#ifdef ENABLE_SSE 419+#include "nnacl/sse/add_fp32_sse.h" 420+#endif 421+ 422+#ifdef ENABLE_ARM 423+#include "nnacl/neon/add_fp32_neon.h" 424+#endif 425+ 426+#endif 427diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h 428new file mode 100644 429index 00000000..898fe882 430--- /dev/null 431+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h 432@@ -0,0 +1,36 @@ 433+/** 434+ * Copyright 2022 Huawei Technologies Co., Ltd 435+ * 436+ * Licensed under the Apache License, Version 2.0 (the "License"); 437+ * you may not use this file except in compliance with the License. 438+ * You may obtain a copy of the License at 439+ * 440+ * http://www.apache.org/licenses/LICENSE-2.0 441+ * 442+ * Unless required by applicable law or agreed to in writing, software 443+ * distributed under the License is distributed on an "AS IS" BASIS, 444+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 445+ * See the License for the specific language governing permissions and 446+ * limitations under the License. 
447+ */ 448+#ifndef MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_ 449+#define MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_ 450+ 451+#include "nnacl/intrinsics/ms_simd_instructions.h" 452+#ifdef ENABLE_AVX512 453+#include "nnacl/avx512/arithmetic_fp32_avx512.h" 454+#endif 455+ 456+#ifdef ENABLE_AVX 457+#include "nnacl/avx/arithmetic_fp32_avx.h" 458+#endif 459+ 460+#ifdef ENABLE_SSE 461+#include "nnacl/sse/arithmetic_fp32_sse.h" 462+#endif 463+ 464+#ifdef ENABLE_ARM 465+#include "nnacl/neon/arithmetic_fp32_neon.h" 466+#endif 467+ 468+#endif 469diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h 470new file mode 100644 471index 00000000..676b53ec 472--- /dev/null 473+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h 474@@ -0,0 +1,36 @@ 475+/** 476+ * Copyright 2022 Huawei Technologies Co., Ltd 477+ * 478+ * Licensed under the Apache License, Version 2.0 (the "License"); 479+ * you may not use this file except in compliance with the License. 480+ * You may obtain a copy of the License at 481+ * 482+ * http://www.apache.org/licenses/LICENSE-2.0 483+ * 484+ * Unless required by applicable law or agreed to in writing, software 485+ * distributed under the License is distributed on an "AS IS" BASIS, 486+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 487+ * See the License for the specific language governing permissions and 488+ * limitations under the License. 
489+ */ 490+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_ 491+#define MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_ 492+ 493+#include "nnacl/intrinsics/ms_simd_instructions.h" 494+#ifdef ENABLE_AVX512 495+#include "nnacl/avx512/arithmetic_self_fp32_avx512.h" 496+#endif 497+ 498+#ifdef ENABLE_AVX 499+#include "nnacl/avx/arithmetic_self_fp32_avx.h" 500+#endif 501+ 502+#ifdef ENABLE_SSE 503+#include "nnacl/sse/arithmetic_self_fp32_sse.h" 504+#endif 505+ 506+#ifdef ENABLE_ARM 507+#include "nnacl/neon/arithmetic_self_fp32_neon.h" 508+#endif 509+ 510+#endif 511diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h 512new file mode 100644 513index 00000000..49edf7ec 514--- /dev/null 515+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h 516@@ -0,0 +1,221 @@ 517+/** 518+ * Copyright 2022 Huawei Technologies Co., Ltd 519+ * 520+ * Licensed under the Apache License, Version 2.0 (the "License"); 521+ * you may not use this file except in compliance with the License. 522+ * You may obtain a copy of the License at 523+ * 524+ * http://www.apache.org/licenses/LICENSE-2.0 525+ * 526+ * Unless required by applicable law or agreed to in writing, software 527+ * distributed under the License is distributed on an "AS IS" BASIS, 528+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 529+ * See the License for the specific language governing permissions and 530+ * limitations under the License. 
531+ */ 532+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_ 533+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_ 534+ 535+#include "nnacl/intrinsics/ms_simd_instructions.h" 536+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 537+ 538+#ifdef __cplusplus 539+extern "C" { 540+#endif 541+#pragma GCC push_options 542+#pragma GCC target("avx", "avx2") 543+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 544+#define BLOCK_NUM 8 545+#define MS_SIMD_AVX 546+ 547+static inline int Fp32ReluAVX(int index, const float *src, int length, float *dst) { 548+ SIMD_F32 zero = SIMD_SET0_F32; 549+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 550+ SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); 551+ } 552+ return index; 553+} 554+ 555+static inline int Int32ReluAVX(int index, const int32_t *src, int length, int32_t *dst) { 556+ SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f); 557+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 558+ SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); 559+ } 560+ return index; 561+} 562+ 563+static inline int Fp32Relu6AVX(int index, const float *src, int length, float *dst) { 564+ SIMD_F32 zero = SIMD_SET0_F32; 565+ SIMD_F32 six = SIMD_MOV_F32(6.0f); 566+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 567+ SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); 568+ } 569+ return index; 570+} 571+ 572+static inline int LReluAVX(int index, const float *src, int length, float *dst, float alpha) { 573+ SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); 574+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 575+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 576+ SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp); 577+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), 
mask)); 578+ } 579+ return index; 580+} 581+ 582+static inline int SigmoidAVX(int index, const float *src, int length, float *dst) { 583+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 584+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); 585+ SIMD_ST_F32(dst + index, 586+ SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 587+ } 588+ return index; 589+} 590+ 591+static inline int TanhAVX(int index, const float *src, int length, float *dst) { 592+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 593+ SIMD_F32 input = SIMD_LD_F32(src + index); 594+ SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); 595+ } 596+ return index; 597+} 598+ 599+static inline int SwishAVX(int index, const float *src, int length, float *dst) { 600+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 601+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 602+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); 603+ SIMD_ST_F32(dst + index, 604+ SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 605+ } 606+ return index; 607+} 608+ 609+static inline int HSwishAVX(int index, const float *src, int length, float *dst) { 610+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 611+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 612+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 613+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); 614+ } 615+ return index; 616+} 617+ 618+static inline int HSigmoidAVX(int index, const float *src, int length, float *dst) { 619+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 620+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 621+ SIMD_F32 relu6 = 
SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 622+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); 623+ } 624+ return index; 625+} 626+ 627+static inline int HardTanhNoLimitMinAVX(int index, const float *src, int length, float *dst, float min_val, 628+ float max_val) { 629+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 630+ SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); 631+ } 632+ return index; 633+} 634+ 635+static inline int HardTanhNoLimitMaxAVX(int index, const float *src, int length, float *dst, float min_val, 636+ float max_val) { 637+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 638+ SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); 639+ } 640+ return index; 641+} 642+ 643+static inline int HardTanhLimitMinMaxAVX(int index, const float *src, int length, float *dst, float min_val, 644+ float max_val) { 645+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 646+ SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); 647+ } 648+ return index; 649+} 650+ 651+static inline int GeluApproximateAVX(int index, const float *src, int length, float *dst) { 652+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 653+ SIMD_F32 in = SIMD_LD_F32(src + index); 654+ SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); 655+ SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); 656+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); 657+ } 658+ return index; 659+} 660+ 661+static inline int GeluAVX(int index, const float *src, int length, float *dst) { 662+ SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); 663+ SIMD_F32 para2 = SIMD_MOV_F32(1.0f); 664+ SIMD_F32 para3 = SIMD_MOV_F32(0.5f); 
665+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 666+ SIMD_F32 in = SIMD_LD_F32(src + index); 667+ SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); 668+ SIMD_ST_F32(dst + index, res); 669+ } 670+ return index; 671+} 672+ 673+static inline int EluAVX(int index, const float *src, int length, float *dst, float alpha) { 674+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 675+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 676+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); 677+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 678+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 679+ } 680+ return index; 681+} 682+ 683+static inline int CeluAVX(int index, const float *src, int length, float *dst, float alpha) { 684+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 685+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 686+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); 687+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 688+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 689+ } 690+ return index; 691+} 692+ 693+static inline int HShrinkAVX(int index, const float *src, int length, float *dst, float lambd) { 694+ const float neg_lambd = -1 * lambd; 695+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 696+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 697+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd)); 698+ SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp); 699+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 700+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask)); 701+ } 702+ return index; 703+} 704+ 705+static 
inline int SoftShrinkAVX(int index, const float *src, int length, float *dst, float lambd) { 706+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 707+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 708+ 709+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 710+ SIMD_F32 src_t = SIMD_LD_F32(src + index); 711+ /* v0 = (in > lamdb) & (in - lamdb) */ 712+ SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v)); 713+ /* v1 = (in < -lamdb) & (in + lamdb) */ 714+ SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v)); 715+ /* out = (v0 | v1) */ 716+ SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1)); 717+ } 718+ return index; 719+} 720+ 721+static inline int SoftsignFp32OptAVX(int index, const float *src, int length, float *dst) { 722+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 723+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 724+ SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp)); 725+ SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp)); 726+ } 727+ return index; 728+} 729+ 730+#undef MS_SIMD_INSTRUCTION 731+#undef BLOCK_NUM 732+#pragma GCC pop_options 733+#undef MS_SIMD_AVX 734+#ifdef __cplusplus 735+} 736+#endif 737+#endif 738diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h 739new file mode 100644 740index 00000000..435d24c5 741--- /dev/null 742+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h 743@@ -0,0 +1,57 @@ 744+/** 745+ * Copyright 2022 Huawei Technologies Co., Ltd 746+ * 747+ * Licensed under the Apache License, Version 2.0 (the "License"); 748+ * you may not use this file except in compliance with the License. 
749+ * You may obtain a copy of the License at 750+ * 751+ * http://www.apache.org/licenses/LICENSE-2.0 752+ * 753+ * Unless required by applicable law or agreed to in writing, software 754+ * distributed under the License is distributed on an "AS IS" BASIS, 755+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 756+ * See the License for the specific language governing permissions and 757+ * limitations under the License. 758+ */ 759+ 760+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_ 761+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_ 762+ 763+#include "nnacl/intrinsics/ms_simd_instructions.h" 764+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 765+ 766+#ifdef __cplusplus 767+extern "C" { 768+#endif 769+#pragma GCC push_options 770+#pragma GCC target("avx", "avx2") 771+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 772+#define BLOCK_NUM 8 773+#define MS_SIMD_AVX 774+ 775+static inline int ShrinkGradAVX(int index, const float *src0, const float *src1, 776+ int length, float *dst, float lambd) { 777+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 778+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 779+ 780+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 781+ SIMD_F32 src0_t = SIMD_LD_F32(src0 + index); 782+ SIMD_F32 src1_t = SIMD_LD_F32(src1 + index); 783+ 784+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v); 785+ SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t); 786+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 787+ 788+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask)); 789+ } 790+ return index; 791+} 792+ 793+#undef MS_SIMD_INSTRUCTION 794+#undef BLOCK_NUM 795+#pragma GCC pop_options 796+#undef MS_SIMD_AVX 797+#ifdef __cplusplus 798+} 799+#endif 800+#endif 801diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h 
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h 802new file mode 100644 803index 00000000..54743d80 804--- /dev/null 805+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h 806@@ -0,0 +1,210 @@ 807+/** 808+ * Copyright 2022 Huawei Technologies Co., Ltd 809+ * 810+ * Licensed under the Apache License, Version 2.0 (the "License"); 811+ * you may not use this file except in compliance with the License. 812+ * You may obtain a copy of the License at 813+ * 814+ * http://www.apache.org/licenses/LICENSE-2.0 815+ * 816+ * Unless required by applicable law or agreed to in writing, software 817+ * distributed under the License is distributed on an "AS IS" BASIS, 818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 819+ * See the License for the specific language governing permissions and 820+ * limitations under the License. 821+ */ 822+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_ 823+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_ 824+ 825+#include "nnacl/intrinsics/ms_simd_instructions.h" 826+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 827+ 828+#ifdef __cplusplus 829+extern "C" { 830+#endif 831+#pragma GCC push_options 832+#pragma GCC target("avx", "avx2") 833+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 834+#define BLOCK_NUM 8 835+#define MS_SIMD_AVX 836+#ifdef MS_SIMD_AVX512 837+ static inline size_t AdamWeightDecayFp32AVX(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 838+ const float *gradient, size_t end) { 839+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 840+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 841+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 842+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 843+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 844+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 845+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 846+ 847+ for (size_t block_max_size = end - 
BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 848+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 849+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 850+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 851+ SIMD_F32 g_r = SIMD_LD_F32(gradient + index); 852+ 853+ m_r = SIMD_MUL_F32(m_r, beta1_r); 854+ v_r = SIMD_MUL_F32(v_r, beta2_r); 855+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 856+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 857+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 858+ avx_r0 = SIMD_SQRT_F32(v_r); 859+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 860+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 861+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 862+ SIMD_ST_F32(m + index, m_r); 863+ SIMD_ST_F32(v + index, v_r); 864+ SIMD_ST_F32(var + index, var_r); 865+ } 866+ 867+ return index; 868+} 869+ 870+static inline size_t FusedCastAdamFp32Fp16AVX(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 871+ float global_norm_reciprocal, size_t end) { 872+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 873+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 874+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 875+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 876+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 877+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 878+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 879+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 880+ 881+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 882+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 883+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 884+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 885+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 886+ 887+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 888+ m_r = SIMD_MUL_F32(m_r, beta1_r); 889+ v_r = SIMD_MUL_F32(v_r, beta2_r); 890+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, 
g_r); 891+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 892+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 893+ avx_r0 = SIMD_SQRT_F32(v_r); 894+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 895+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 896+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 897+ SIMD_ST_F32(var + index, var_r); 898+ SIMD_ST_F32(m + index, m_r); 899+ SIMD_ST_F32(v + index, v_r); 900+ } 901+ 902+ return index; 903+} 904+ 905+static inline size_t FusedCastAdamFp32Fp32AVX(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 906+ float global_norm_reciprocal, size_t end) { 907+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 908+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 909+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 910+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 911+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 912+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 913+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 914+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 915+ 916+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 917+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 918+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 919+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 920+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 921+ 922+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 923+ m_r = SIMD_MUL_F32(m_r, beta1_r); 924+ v_r = SIMD_MUL_F32(v_r, beta2_r); 925+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 926+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 927+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 928+ avx_r0 = SIMD_SQRT_F32(v_r); 929+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 930+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 931+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 932+ SIMD_ST_F32(var + index, var_r); 933+ SIMD_ST_F32(m + index, m_r); 934+ 
SIMD_ST_F32(v + index, v_r); 935+ } 936+ 937+ return index; 938+} 939+ 940+static inline size_t FusedCastAdamFp16Fp16AVX(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 941+ float global_norm_reciprocal, size_t end) { 942+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 943+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 944+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 945+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 946+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 947+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 948+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 949+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 950+ 951+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 952+ SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16)); 953+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 954+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 955+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 956+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 957+ m_r = SIMD_MUL_F32(m_r, beta1_r); 958+ v_r = SIMD_MUL_F32(v_r, beta2_r); 959+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 960+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 961+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 962+ avx_r0 = SIMD_SQRT_F32(v_r); 963+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 964+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 965+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 966+ SIMD_ST_F32(m + index, m_r); 967+ SIMD_ST_F32(v + index, v_r); 968+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 969+ } 970+ 971+ return index; 972+} 973+ 974+static inline size_t FusedCastAdamFp16Fp32AVX(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 975+ float global_norm_reciprocal, size_t end) { 976+ SIMD_F32 
beta1_r = SIMD_MOV_F32(beta1); 977+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 978+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 979+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 980+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 981+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 982+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 983+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 984+ 985+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 986+ SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16)); 987+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 988+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 989+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 990+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 991+ m_r = SIMD_MUL_F32(m_r, beta1_r); 992+ v_r = SIMD_MUL_F32(v_r, beta2_r); 993+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 994+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 995+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 996+ avx_r0 = SIMD_SQRT_F32(v_r); 997+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 998+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 999+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 1000+ SIMD_ST_F32(m + index, m_r); 1001+ SIMD_ST_F32(v + index, v_r); 1002+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 1003+ } 1004+ 1005+ return index; 1006+} 1007+#endif 1008+ 1009+#undef MS_SIMD_INSTRUCTION 1010+#undef BLOCK_NUM 1011+#pragma GCC pop_options 1012+#undef MS_SIMD_AVX 1013+#ifdef __cplusplus 1014+} 1015+#endif 1016+#endif 1017diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h 1018new file mode 100644 1019index 00000000..716c25b1 1020--- /dev/null 1021+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h 1022@@ -0,0 +1,124 @@ 1023+/** 1024+ * Copyright 2022 Huawei Technologies Co., 
Ltd 1025+ * 1026+ * Licensed under the Apache License, Version 2.0 (the "License"); 1027+ * you may not use this file except in compliance with the License. 1028+ * You may obtain a copy of the License at 1029+ * 1030+ * http://www.apache.org/licenses/LICENSE-2.0 1031+ * 1032+ * Unless required by applicable law or agreed to in writing, software 1033+ * distributed under the License is distributed on an "AS IS" BASIS, 1034+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1035+ * See the License for the specific language governing permissions and 1036+ * limitations under the License. 1037+ */ 1038+ 1039+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX_H_ 1040+#define MINDSPORE_NNACL_FP32_ADD_AVX_H_ 1041+ 1042+#include "nnacl/intrinsics/ms_simd_instructions.h" 1043+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1044+ 1045+#ifdef __cplusplus 1046+extern "C" { 1047+#endif 1048+#pragma GCC push_options 1049+#pragma GCC target("avx", "avx2") 1050+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1051+#define BLOCK_NUM 8 1052+#define MS_SIMD_AVX 1053+ 1054+static inline int ElementOptAddAVX(int index, const float *in0, const float *in1, float *out, int size) { 1055+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 1056+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1057+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 1058+ SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); 1059+ SIMD_ST_F32(out + index, vout); 1060+ } 1061+ return index; 1062+} 1063+ 1064+static inline int ElementOptAddIntAVX(int index, const int *in0, const int *in1, int *out, 1065+ int size) { 1066+ SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); 1067+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1068+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 1069+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); 1070+ SIMD_ST_EPI32(out + index, vout); 1071+ } 1072+ return index; 1073+} 1074+ 1075+static inline int 
ElementOptAddReluAVX(int index, const float *in0, const float *in1, float *out, 1076+ int size) { 1077+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 1078+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1079+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 1080+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); 1081+ SIMD_ST_F32(out + index, vout); 1082+ } 1083+ return index; 1084+} 1085+ 1086+static inline int ElementOptAddRelu6AVX(int index, const float *in0, const float *in1, float *out, 1087+ int size) { 1088+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 1089+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1090+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 1091+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); 1092+ SIMD_ST_F32(out + index, vout); 1093+ } 1094+ return index; 1095+} 1096+ 1097+static inline int ElementAddAVX(int index, const float *in0, const float *in1, float *out, int size) { 1098+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1099+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 1100+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 1101+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 1102+ SIMD_ST_F32(out + index, vout); 1103+ } 1104+ return index; 1105+} 1106+ 1107+static inline int ElementAddReluAVX(int index, const float *in0, const float *in1, float *out, 1108+ int size) { 1109+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1110+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 1111+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 1112+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); 1113+ SIMD_ST_F32(out + index, vout); 1114+ } 1115+ return index; 1116+} 1117+ 1118+static inline int ElementAddRelu6AVX(int index, const float *in0, const float *in1, float *out, 1119+ int size) { 1120+ for (int block_max_size = size - BLOCK_NUM + 1; 
index < block_max_size; index += BLOCK_NUM) { 1121+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 1122+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 1123+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); 1124+ SIMD_ST_F32(out + index, vout); 1125+ } 1126+ return index; 1127+} 1128+ 1129+static inline int ElementAddIntAVX(int index, const int *in0, const int *in1, int *out, int size) { 1130+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1131+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 1132+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 1133+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1); 1134+ SIMD_ST_EPI32(out + index, vout); 1135+ } 1136+ return index; 1137+} 1138+ 1139+#undef MS_SIMD_INSTRUCTION 1140+#undef BLOCK_NUM 1141+#pragma GCC pop_options 1142+#undef MS_SIMD_AVX 1143+#ifdef __cplusplus 1144+} 1145+#endif 1146+#endif 1147diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h 1148new file mode 100644 1149index 00000000..9dd24100 1150--- /dev/null 1151+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h 1152@@ -0,0 +1,254 @@ 1153+/** 1154+ * Copyright 2022 Huawei Technologies Co., Ltd 1155+ * 1156+ * Licensed under the Apache License, Version 2.0 (the "License"); 1157+ * you may not use this file except in compliance with the License. 1158+ * You may obtain a copy of the License at 1159+ * 1160+ * http://www.apache.org/licenses/LICENSE-2.0 1161+ * 1162+ * Unless required by applicable law or agreed to in writing, software 1163+ * distributed under the License is distributed on an "AS IS" BASIS, 1164+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1165+ * See the License for the specific language governing permissions and 1166+ * limitations under the License. 
 */

#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX_H_
#define MINDSPORE_NNACL_ARITHMETIC_AVX_H_

#include "nnacl/intrinsics/ms_simd_instructions.h"
#include "nnacl/intrinsics/ms_simd_avx_instructions.h"

#ifdef __cplusplus
extern "C" {
#endif
#pragma GCC push_options
#pragma GCC target("avx", "avx2")
#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
#define BLOCK_NUM 8
#define MS_SIMD_AVX

/* Each kernel processes full 8-wide vectors from `index` and returns the first
 * index it did not handle; the scalar caller finishes the remainder.
 * "OptXxxNum0" broadcasts in0[0]; "OptXxxNum1" broadcasts in1[0]. */

#ifndef MS_SIMD_NEON
/* out[i] = in0[i] - floor(in0[i] / in1[i]) * in1[i]  (Python-style fmod) */
static inline int ElementFloorModAVX(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* FloorMod with scalar first operand in0[0]. */
static inline int ElementOptFloorModNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* FloorMod with scalar second operand in1[0]. */
static inline int ElementOptFloorModNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* out[i] = floor(in0[i] / in1[i]) */
static inline int ElementFloorDivAVX(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, floor_tmp);
  }
  return index;
}

/* FloorDiv with scalar first operand in0[0]. */
static inline int ElementOptFloorDivNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* FloorDiv with scalar second operand in1[0]. */
static inline int ElementOptFloorDivNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}
#endif

/* Integer floor-div; NOTE(review): SIMD_DIV_EPI32 truncates toward zero, which
 * differs from floor for negative quotients — confirm intended semantics. */
static inline int ElementFloorDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* Integer floor-div with scalar first operand in0[0]. */
static inline int ElementOptFloorDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* Integer floor-div with scalar second operand in1[0]. */
static inline int ElementOptFloorDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* out[i] = max(in0[i], in1[i]) */
static inline int ElementMaximumAVX(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* Maximum with scalar first operand in0[0]. */
static inline int ElementOptMaximumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* Maximum with scalar second operand in1[0]. */
static inline int ElementOptMaximumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* out[i] = max(in0[i], in1[i]), int32 */
static inline int ElementMaximumIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* Integer maximum with scalar first operand in0[0]. */
static inline int ElementOptMaximumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* Integer maximum with scalar second operand in1[0]. */
static inline int ElementOptMaximumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* out[i] = min(in0[i], in1[i]), int32 */
static inline int ElementMinimumIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* Integer minimum with scalar first operand in0[0]. */
static inline int ElementOptMinimumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* Integer minimum with scalar second operand in1[0]. */
static inline int ElementOptMinimumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) {
  SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index);
    SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp);
    SIMD_ST_EPI32(out + index, out_tmp);
  }
  return index;
}

/* out[i] = min(in0[i], in1[i]) */
static inline int ElementMinimumAVX(int index, const float *in0, const float *in1, float *out, int size) {
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
    SIMD_ST_F32(out + index, out_tmp);
  }
  return index;
}

/* Minimum with scalar first operand in0[0]; continues past this view. */
static inline int ElementOptMinimumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp,
in1_tmp); 1384+ SIMD_ST_F32(out + index, out_tmp); 1385+ } 1386+ return index; 1387+} 1388+ 1389+static inline int ElementOptMinimumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { 1390+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 1391+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1392+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 1393+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 1394+ SIMD_ST_F32(out + index, out_tmp); 1395+ } 1396+ return index; 1397+} 1398+ 1399+#undef MS_SIMD_INSTRUCTION 1400+#undef BLOCK_NUM 1401+#pragma GCC pop_options 1402+#undef MS_SIMD_AVX 1403+#ifdef __cplusplus 1404+} 1405+#endif 1406+#endif 1407diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h 1408new file mode 100644 1409index 00000000..c48500f4 1410--- /dev/null 1411+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h 1412@@ -0,0 +1,129 @@ 1413+/** 1414+ * Copyright 2022 Huawei Technologies Co., Ltd 1415+ * 1416+ * Licensed under the Apache License, Version 2.0 (the "License"); 1417+ * you may not use this file except in compliance with the License. 1418+ * You may obtain a copy of the License at 1419+ * 1420+ * http://www.apache.org/licenses/LICENSE-2.0 1421+ * 1422+ * Unless required by applicable law or agreed to in writing, software 1423+ * distributed under the License is distributed on an "AS IS" BASIS, 1424+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1425+ * See the License for the specific language governing permissions and 1426+ * limitations under the License. 
1427+ */ 1428+ 1429+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_ 1430+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_ 1431+ 1432+#include "nnacl/intrinsics/ms_simd_instructions.h" 1433+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1434+ 1435+#ifdef __cplusplus 1436+extern "C" { 1437+#endif 1438+#pragma GCC push_options 1439+#pragma GCC target("avx", "avx2") 1440+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1441+#define BLOCK_NUM 8 1442+#define MS_SIMD_AVX 1443+ 1444+#if defined(MS_SIMD_AVX512) 1445+// only avx512 support abs fp32 instruction 1446+static inline int ElementAbsAVX(int index, const float *input, float *output, const int element_size) { 1447+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1448+ SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index))); 1449+ } 1450+ return index; 1451+} 1452+ 1453+static inline int ElementAbsIntAVX(int index, const int *input, int *output, const int element_size) { 1454+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1455+ SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index))); 1456+ } 1457+ return index; 1458+} 1459+#endif 1460+ 1461+static inline int ElementSquareAVX(int index, const float *input, float *output, const int element_size) { 1462+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1463+ SIMD_F32 vin = SIMD_LD_F32(input + index); 1464+ SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin)); 1465+ } 1466+ return index; 1467+} 1468+ 1469+static inline int ElementSqrtAVX(int index, const float *input, float *output, const int element_size) { 1470+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1471+ SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index))); 1472+ } 1473+ return index; 1474+} 1475+ 1476+static inline int 
ElementRsqrtAVX(int index, const float *input, float *output, const int element_size) { 1477+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1478+ SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index))); 1479+ } 1480+ return index; 1481+} 1482+ 1483+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE) 1484+// avx512 dont support round fp32 instruction 1485+static inline int ElementRoundAVX(int index, const float *input, float *output, const int element_size) { 1486+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1487+ SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index))); 1488+ } 1489+ return index; 1490+} 1491+#endif 1492+ 1493+#ifndef MS_SIMD_NEON 1494+// neon dont support floor fp32 instruction 1495+static inline int ElementFloorAVX(int index, const float *input, float *output, const int element_size) { 1496+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1497+ SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index))); 1498+ } 1499+ return index; 1500+} 1501+#endif 1502+ 1503+#ifndef MS_SIMD_NEON 1504+static inline int ElementCeilAVX(int index, const float *input, float *output, const int element_size) { 1505+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1506+ SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index))); 1507+ } 1508+ return index; 1509+} 1510+#endif 1511+ 1512+static inline int ElementNegativeAVX(int index, const float *input, float *output, const int element_size) { 1513+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1514+ SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f)); 1515+ } 1516+ return index; 1517+} 1518+ 1519+static inline int ElementNegativeIntAVX(int index, const int *input, int 
*output, const int element_size) { 1520+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1521+ SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1)); 1522+ } 1523+ return index; 1524+} 1525+ 1526+static inline int ElementReciprocalAVX(int index, const float *input, float *output, const int element_size) { 1527+ SIMD_F32 num1 = SIMD_MOV_F32(1.0f); 1528+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1529+ SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index))); 1530+ } 1531+ return index; 1532+} 1533+ 1534+#undef MS_SIMD_INSTRUCTION 1535+#undef BLOCK_NUM 1536+#pragma GCC pop_options 1537+#undef MS_SIMD_AVX 1538+#ifdef __cplusplus 1539+} 1540+#endif 1541+#endif 1542diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h 1543new file mode 100644 1544index 00000000..11a9087b 1545--- /dev/null 1546+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h 1547@@ -0,0 +1,67 @@ 1548+/** 1549+ * Copyright 2022 Huawei Technologies Co., Ltd 1550+ * 1551+ * Licensed under the Apache License, Version 2.0 (the "License"); 1552+ * you may not use this file except in compliance with the License. 1553+ * You may obtain a copy of the License at 1554+ * 1555+ * http://www.apache.org/licenses/LICENSE-2.0 1556+ * 1557+ * Unless required by applicable law or agreed to in writing, software 1558+ * distributed under the License is distributed on an "AS IS" BASIS, 1559+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1560+ * See the License for the specific language governing permissions and 1561+ * limitations under the License. 
1562+ */ 1563+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_ 1564+#define MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_ 1565+ 1566+#include "nnacl/intrinsics/ms_simd_instructions.h" 1567+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1568+ 1569+#ifdef __cplusplus 1570+extern "C" { 1571+#endif 1572+#pragma GCC push_options 1573+#pragma GCC target("avx", "avx2") 1574+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1575+#define BLOCK_NUM 8 1576+#define MS_SIMD_AVX 1577+ 1578+static inline int BatchNormFp32AVX(int index, const float *input, const float *mean, 1579+ const float *variance, int channel, float epsilon, float *output) { 1580+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1581+ SIMD_F32 input_data = SIMD_LD_F32(input + index); 1582+ SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 1583+ SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 1584+ SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 1585+ SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 1586+ SIMD_ST_F32(output + index, output_data); 1587+ } 1588+ return index; 1589+} 1590+ 1591+static inline int FusedBatchNormFp32AVX(int index, const float *input, const float *scale, 1592+ const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) { 1593+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1594+ SIMD_F32 input_data = SIMD_LD_F32(input + index); 1595+ SIMD_F32 scale_ = SIMD_LD_F32(scale + index); 1596+ SIMD_F32 offset_ = SIMD_LD_F32(offset + index); 1597+ SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 1598+ SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 1599+ SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 1600+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 1601+ SIMD_F32 output_data =
SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_); 1602+ SIMD_ST_F32(output + index, output_data); 1603+ } 1604+ return index; 1605+} 1606+ 1607+#undef MS_SIMD_INSTRUCTION 1608+#undef BLOCK_NUM 1609+#pragma GCC pop_options 1610+#undef MS_SIMD_AVX 1611+#ifdef __cplusplus 1612+} 1613+#endif 1614+#endif 1615diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h 1616new file mode 100644 1617index 00000000..9da68a79 1618--- /dev/null 1619+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h 1620@@ -0,0 +1,69 @@ 1621+/** 1622+ * Copyright 2022 Huawei Technologies Co., Ltd 1623+ * 1624+ * Licensed under the Apache License, Version 2.0 (the "License"); 1625+ * you may not use this file except in compliance with the License. 1626+ * You may obtain a copy of the License at 1627+ * 1628+ * http://www.apache.org/licenses/LICENSE-2.0 1629+ * 1630+ * Unless required by applicable law or agreed to in writing, software 1631+ * distributed under the License is distributed on an "AS IS" BASIS, 1632+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1633+ * See the License for the specific language governing permissions and 1634+ * limitations under the License. 
1635+ */ 1636+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_ 1637+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_ 1638+ 1639+#include "nnacl/intrinsics/ms_simd_instructions.h" 1640+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1641+ 1642+#ifdef __cplusplus 1643+extern "C" { 1644+#endif 1645+#pragma GCC push_options 1646+#pragma GCC target("avx", "avx2") 1647+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1648+#define BLOCK_NUM 8 1649+#define MS_SIMD_AVX 1650+ 1651+static inline int BCEWithLogitLossAVX(int index, const float *logits, const float *label, 1652+ const float *weight, const float *pos_weight, int length, bool reduction, float *output, 1653+ float *reduction_sum) { 1654+ SIMD_F32 zero = SIMD_SET0_F32; 1655+ SIMD_F32 ones = SIMD_MOV_F32(1.0f); 1656+ SIMD_F32 middle_output = SIMD_SET0_F32; 1657+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1658+ SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); 1659+ SIMD_F32 label_tmp = SIMD_LD_F32(label + index); 1660+ SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); 1661+ SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); 1662+ SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); 1663+ SIMD_F32 max_value = neg_logits_tmp; 1664+ max_value = SIMD_MIN_F32(max_value, zero); 1665+ SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); 1666+ SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); 1667+ SIMD_F32 log_exp_value = 1668+ SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); 1669+ SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), 1670+ SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); 1671+ if (reduction) { 1672+ middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); 1673+ } else { 1674+ SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp)); 
1675+ } 1676+ } 1677+ if (reduction) { 1678+ *reduction_sum += SIMD_GET_SUM_F32(middle_output); 1679+ } 1680+ return index; 1681+} 1682+#undef MS_SIMD_INSTRUCTION 1683+#undef BLOCK_NUM 1684+#pragma GCC pop_options 1685+#undef MS_SIMD_AVX 1686+#ifdef __cplusplus 1687+} 1688+#endif 1689+#endif 1690diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h 1691new file mode 100644 1692index 00000000..e54588bb 1693--- /dev/null 1694+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h 1695@@ -0,0 +1,64 @@ 1696+/** 1697+ * Copyright 2022 Huawei Technologies Co., Ltd 1698+ * 1699+ * Licensed under the Apache License, Version 2.0 (the "License"); 1700+ * you may not use this file except in compliance with the License. 1701+ * You may obtain a copy of the License at 1702+ * 1703+ * http://www.apache.org/licenses/LICENSE-2.0 1704+ * 1705+ * Unless required by applicable law or agreed to in writing, software 1706+ * distributed under the License is distributed on an "AS IS" BASIS, 1707+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1708+ * See the License for the specific language governing permissions and 1709+ * limitations under the License. 
1710+ */ 1711+ 1712+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_ 1713+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_ 1714+ 1715+#include "nnacl/intrinsics/ms_simd_instructions.h" 1716+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1717+ 1718+#ifdef __cplusplus 1719+extern "C" { 1720+#endif 1721+#pragma GCC push_options 1722+#pragma GCC target("avx", "avx2") 1723+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1724+#define BLOCK_NUM 8 1725+#define MS_SIMD_AVX 1726+ 1727+static inline int BiasAddByInnerCoreAVX(int index, const float *input, const float *bias, float *output, 1728+ int64_t num) { 1729+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1730+ SIMD_F32 vin0 = SIMD_LD_F32(input + index); 1731+ SIMD_F32 vin1 = SIMD_LD_F32(bias + index); 1732+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 1733+ SIMD_ST_F32(output + index, vout); 1734+ } 1735+ return index; 1736+} 1737+ 1738+static inline int BiasAddByBatchCoreAVX(int index, const float *input, const float *bias, float *output1, 1739+ float *output2, float *output3, float *output4, int64_t num) { 1740+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1741+ SIMD_LDX4_F32(input_data, input + index, num); 1742+ SIMD_F32 bias_data = SIMD_LD_F32(bias + index); 1743+ SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data)); 1744+ SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data)); 1745+ SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data)); 1746+ SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data)); 1747+ } 1748+ return index; 1749+} 1750+ 1751+#undef MS_SIMD_INSTRUCTION 1752+#undef BLOCK_NUM 1753+#pragma GCC pop_options 1754+#undef MS_SIMD_AVX 1755+#ifdef __cplusplus 1756+} 1757+#endif 1758+ 1759+#endif // MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_ 1760diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h 1761new file mode 100644 1762index 00000000..44176549 1763--- /dev/null 1764+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h 1765@@ -0,0 +1,56 @@ 1766+/** 1767+ * Copyright 2022 Huawei Technologies Co., Ltd 1768+ * 1769+ * Licensed under the Apache License, Version 2.0 (the "License"); 1770+ * you may not use this file except in compliance with the License. 1771+ * You may obtain a copy of the License at 1772+ * 1773+ * http://www.apache.org/licenses/LICENSE-2.0 1774+ * 1775+ * Unless required by applicable law or agreed to in writing, software 1776+ * distributed under the License is distributed on an "AS IS" BASIS, 1777+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1778+ * See the License for the specific language governing permissions and 1779+ * limitations under the License. 1780+ */ 1781+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_ 1782+#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_ 1783+ 1784+#include "nnacl/intrinsics/ms_simd_instructions.h" 1785+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1786+ 1787+#ifdef __cplusplus 1788+extern "C" { 1789+#endif 1790+#pragma GCC push_options 1791+#pragma GCC target("avx", "avx2") 1792+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1793+#define BLOCK_NUM 8 1794+#define MS_SIMD_AVX 1795+ 1796+static inline int Int32ToFloat32AVX(int index, const int32_t *input, float *output, int number) { 1797+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1798+ SIMD_EPI32 value = SIMD_LD_EPI32(input + index); 1799+ SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); 1800+ } 1801+ return index; 1802+} 1803+ 1804+#ifndef MS_SIMD_NEON 1805+static inline int Float32ToInt32AVX(int index, const float *input, int32_t *output, int number) { 1806+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += 
BLOCK_NUM) { 1807+ SIMD_F32 value = SIMD_LD_F32(input + index); 1808+ SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); 1809+ } 1810+ return index; 1811+} 1812+#endif 1813+ 1814+#undef MS_SIMD_INSTRUCTION 1815+#undef BLOCK_NUM 1816+#pragma GCC pop_options 1817+#undef MS_SIMD_AVX 1818+#ifdef __cplusplus 1819+} 1820+#endif 1821+#endif 1822diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h 1823new file mode 100644 1824index 00000000..dac9efa9 1825--- /dev/null 1826+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h 1827@@ -0,0 +1,70 @@ 1828+/** 1829+ * Copyright 2022 Huawei Technologies Co., Ltd 1830+ * 1831+ * Licensed under the Apache License, Version 2.0 (the "License"); 1832+ * you may not use this file except in compliance with the License. 1833+ * You may obtain a copy of the License at 1834+ * 1835+ * http://www.apache.org/licenses/LICENSE-2.0 1836+ * 1837+ * Unless required by applicable law or agreed to in writing, software 1838+ * distributed under the License is distributed on an "AS IS" BASIS, 1839+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1840+ * See the License for the specific language governing permissions and 1841+ * limitations under the License. 
1842+ */ 1843+#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX_H_ 1844+#define MINDSPORE_NNACL_FP32_CDIST_AVX_H_ 1845+ 1846+#include "nnacl/intrinsics/ms_simd_instructions.h" 1847+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1848+ 1849+#ifdef __cplusplus 1850+extern "C" { 1851+#endif 1852+#pragma GCC push_options 1853+#pragma GCC target("avx", "avx2") 1854+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1855+#define BLOCK_NUM 8 1856+#define MS_SIMD_AVX 1857+ 1858+static inline int64_t CdistTwoNormalOptAVX(int64_t index, const float *a, const float *b, 1859+ float *out, int64_t size) { 1860+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 1861+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1862+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 1863+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 1864+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 1865+ tmp_vec = SIMD_ABS_F32(tmp_vec); 1866+ result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); 1867+ } 1868+ *out += SIMD_GET_SUM_F32(result_vec); 1869+ 1870+ return index; 1871+} 1872+ 1873+static inline int64_t CdistPNormalOptAVX(int64_t index, const float *a, const float *b, 1874+ float *out, int64_t size, float p) { 1875+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 1876+ SIMD_F32 p_vec = SIMD_MOV_F32(p); 1877+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1878+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 1879+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 1880+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 1881+ tmp_vec = SIMD_ABS_F32(tmp_vec); 1882+ tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); 1883+ result_vec = SIMD_ADD_F32(tmp_vec, result_vec); 1884+ } 1885+ *out += SIMD_GET_SUM_F32(result_vec); 1886+ 1887+ return index; 1888+} 1889+ 1890+#undef MS_SIMD_INSTRUCTION 1891+#undef BLOCK_NUM 1892+#pragma GCC pop_options 1893+#undef MS_SIMD_AVX 1894+#ifdef __cplusplus 1895+} 1896+#endif 1897+#endif 1898diff --git 
a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h 1899new file mode 100644 1900index 00000000..7407942f 1901--- /dev/null 1902+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h 1903@@ -0,0 +1,121 @@ 1904+/** 1905+ * Copyright 2022 Huawei Technologies Co., Ltd 1906+ * 1907+ * Licensed under the Apache License, Version 2.0 (the "License"); 1908+ * you may not use this file except in compliance with the License. 1909+ * You may obtain a copy of the License at 1910+ * 1911+ * http://www.apache.org/licenses/LICENSE-2.0 1912+ * 1913+ * Unless required by applicable law or agreed to in writing, software 1914+ * distributed under the License is distributed on an "AS IS" BASIS, 1915+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1916+ * See the License for the specific language governing permissions and 1917+ * limitations under the License. 
1918+ */ 1919+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_ 1920+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_ 1921+ 1922+#include "nnacl/intrinsics/ms_simd_instructions.h" 1923+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 1924+ 1925+#ifdef __cplusplus 1926+extern "C" { 1927+#endif 1928+#pragma GCC push_options 1929+#pragma GCC target("avx", "avx2") 1930+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 1931+#define BLOCK_NUM 8 1932+#define MS_SIMD_AVX 1933+ 1934+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 1935+// (a, b, c) -> (0, a, a+b) exclusive == true 1936+static inline int64_t CumsumOutputInitWithInputAVX(int64_t index, const float *layer_input, 1937+ float *layer_output, int inner_dim) { 1938+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1939+ SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index)); 1940+ } 1941+ return index; 1942+} 1943+ 1944+static inline int64_t CumsumOutputInitWithZeroAVX(int64_t index, float *layer_output, int inner_dim) { 1945+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1946+ SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f)); 1947+ } 1948+ return index; 1949+} 1950+ 1951+static inline int64_t CumsumAVX(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output, 1952+ int inner_dim) { 1953+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1954+ SIMD_F32 input_val = SIMD_LD_F32(layer_input + index); 1955+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index); 1956+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 1957+ SIMD_ST_F32(layer_output + index, out_val); 1958+ } 1959+ return index; 1960+} 1961+ 1962+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 1963+// (a, b, c) -> (c+b, c, 0) exclusive==true 1964+static inline int64_t CumsumReverseAVX(int64_t index, const float *layer_input, float 
*layer_output, 1965+ float *layer_last_output, int inner_dim) { 1966+ 1967+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1968+ SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1); 1969+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1); 1970+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 1971+ SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val); 1972+ } 1973+ return index; 1974+} 1975+ 1976+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 1977+// (a, b, c) -> (0, a, a+b) exclusive == true 1978+static inline int64_t CumsumIntOutputInitWithInputAVX(int64_t index, const int *layer_input, 1979+ int *layer_output, int inner_dim) { 1980+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1981+ SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index)); 1982+ } 1983+ return index; 1984+} 1985+ 1986+static inline int64_t CumsumIntOutputInitWithZeroAVX(int64_t index, int *layer_output, int inner_dim) { 1987+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1988+ SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0)); 1989+ } 1990+ return index; 1991+} 1992+ 1993+static inline int64_t CumsumIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 1994+ int inner_dim) { 1995+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 1996+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index); 1997+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index); 1998+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 1999+ SIMD_ST_EPI32(layer_output + index, out_val); 2000+ } 2001+ return index; 2002+} 2003+ 2004+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 2005+// (a, b, c) -> (c+b, c, 0) exclusive==true 2006+static inline int64_t
CumsumReverseIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 2007+ int inner_dim) { 2008+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2009+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); 2010+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); 2011+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 2012+ SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); 2013+ } 2014+ return index; 2015+} 2016+ 2017+#undef MS_SIMD_INSTRUCTION 2018+#undef BLOCK_NUM 2019+#pragma GCC pop_options 2020+#undef MS_SIMD_AVX 2021+#ifdef __cplusplus 2022+} 2023+#endif 2024+#endif 2025diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h 2026new file mode 100644 2027index 00000000..3710151e 2028--- /dev/null 2029+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h 2030@@ -0,0 +1,167 @@ 2031+/** 2032+ * Copyright 2022 Huawei Technologies Co., Ltd 2033+ * 2034+ * Licensed under the Apache License, Version 2.0 (the "License"); 2035+ * you may not use this file except in compliance with the License. 2036+ * You may obtain a copy of the License at 2037+ * 2038+ * http://www.apache.org/licenses/LICENSE-2.0 2039+ * 2040+ * Unless required by applicable law or agreed to in writing, software 2041+ * distributed under the License is distributed on an "AS IS" BASIS, 2042+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2043+ * See the License for the specific language governing permissions and 2044+ * limitations under the License. 
2045+ */ 2046+ 2047+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX_H_ 2048+#define MINDSPORE_NNACL_FP32_DIV_AVX_H_ 2049+ 2050+#include "nnacl/intrinsics/ms_simd_instructions.h" 2051+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2052+ 2053+#ifdef __cplusplus 2054+extern "C" { 2055+#endif 2056+#pragma GCC push_options 2057+#pragma GCC target("avx", "avx2") 2058+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2059+#define BLOCK_NUM 8 2060+#define MS_SIMD_AVX 2061+ 2062+static inline int ElementOptDivNum0AVX(int index, const float *in0, const float *in1, float *out, 2063+ int size) { 2064+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 2065+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2066+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2067+ SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); 2068+ SIMD_ST_F32(out + index, vout); 2069+ } 2070+ return index; 2071+} 2072+ 2073+static inline int ElementOptDivNum1AVX(int index, const float *in0, const float *in1, float *out, 2074+ int size) { 2075+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 2076+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2077+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2078+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); 2079+ SIMD_ST_F32(out + index, vout); 2080+ } 2081+ return index; 2082+} 2083+ 2084+static inline int ElementOptDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { 2085+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 2086+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2087+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2088+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); 2089+ SIMD_ST_EPI32(out + index, vout); 2090+ } 2091+ return index; 2092+} 2093+ 2094+static inline int ElementOptDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { 2095+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 2096+ for (int 
block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2097+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2098+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); 2099+ SIMD_ST_EPI32(out + index, vout); 2100+ } 2101+ return index; 2102+} 2103+ 2104+static inline int ElementOptDivReluNum0AVX(int index, const float *in0, const float *in1, float *out, 2105+ int size) { 2106+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 2107+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2108+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2109+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); 2110+ SIMD_ST_F32(out + index, vout); 2111+ } 2112+ return index; 2113+} 2114+ 2115+static inline int ElementOptDivReluNum1AVX(int index, const float *in0, const float *in1, float *out, 2116+ int size) { 2117+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 2118+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2119+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2120+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); 2121+ SIMD_ST_F32(out + index, vout); 2122+ } 2123+ return index; 2124+} 2125+ 2126+static inline int ElementOptDivRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, 2127+ int size) { 2128+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 2129+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2130+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2131+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); 2132+ SIMD_ST_F32(out + index, vout); 2133+ } 2134+ return index; 2135+} 2136+ 2137+static inline int ElementOptDivRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, 2138+ int size) { 2139+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 2140+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 
2141+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2142+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); 2143+ SIMD_ST_F32(out + index, vout); 2144+ } 2145+ return index; 2146+} 2147+ 2148+static inline int ElementDivAVX(int index, const float *in0, const float *in1, float *out, int size) { 2149+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2150+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2151+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2152+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); 2153+ SIMD_ST_F32(out + index, vout); 2154+ } 2155+ return index; 2156+} 2157+ 2158+static inline int ElementDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) { 2159+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2160+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2161+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2162+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); 2163+ SIMD_ST_EPI32(out + index, vout); 2164+ } 2165+ return index; 2166+} 2167+ 2168+static inline int ElementDivReluAVX(int index, const float *in0, const float *in1, float *out, 2169+ int size) { 2170+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2171+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2172+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2173+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); 2174+ SIMD_ST_F32(out + index, vout); 2175+ } 2176+ return index; 2177+} 2178+ 2179+static inline int ElementDivRelu6AVX(int index, const float *in0, const float *in1, float *out, 2180+ int size) { 2181+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2182+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2183+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2184+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f); 2185+ SIMD_ST_F32(out + index, 
vout); 2186+ } 2187+ return index; 2188+} 2189+ 2190+#undef MS_SIMD_INSTRUCTION 2191+#undef BLOCK_NUM 2192+#pragma GCC pop_options 2193+#undef MS_SIMD_AVX 2194+#ifdef __cplusplus 2195+}; 2196+#endif 2197+#endif 2198diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h 2199new file mode 100644 2200index 00000000..cbd4eca5 2201--- /dev/null 2202+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h 2203@@ -0,0 +1,46 @@ 2204+/** 2205+ * Copyright 2022 Huawei Technologies Co., Ltd 2206+ * 2207+ * Licensed under the Apache License, Version 2.0 (the "License"); 2208+ * you may not use this file except in compliance with the License. 2209+ * You may obtain a copy of the License at 2210+ * 2211+ * http://www.apache.org/licenses/LICENSE-2.0 2212+ * 2213+ * Unless required by applicable law or agreed to in writing, software 2214+ * distributed under the License is distributed on an "AS IS" BASIS, 2215+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2216+ * See the License for the specific language governing permissions and 2217+ * limitations under the License. 
2218+ */ 2219+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_ 2220+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_ 2221+ 2222+#include "nnacl/intrinsics/ms_simd_instructions.h" 2223+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2224+ 2225+#ifdef __cplusplus 2226+extern "C" { 2227+#endif 2228+#pragma GCC push_options 2229+#pragma GCC target("avx", "avx2") 2230+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2231+#define BLOCK_NUM 8 2232+#define MS_SIMD_AVX 2233+ 2234+static inline int DropoutFp32AVX(int index, const float *input, float scale, 2235+ int length, float *output) { 2236+ SIMD_F32 scale_value = SIMD_MOV_F32(scale); 2237+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2238+ SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); 2239+ } 2240+ return index; 2241+} 2242+#undef MS_SIMD_INSTRUCTION 2243+#undef BLOCK_NUM 2244+#pragma GCC pop_options 2245+#undef MS_SIMD_AVX 2246+#ifdef __cplusplus 2247+} 2248+#endif 2249+#endif 2250diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h 2251new file mode 100644 2252index 00000000..cf7cbd37 2253--- /dev/null 2254+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h 2255@@ -0,0 +1,63 @@ 2256+/** 2257+ * Copyright 2022 Huawei Technologies Co., Ltd 2258+ * 2259+ * Licensed under the Apache License, Version 2.0 (the "License"); 2260+ * you may not use this file except in compliance with the License. 2261+ * You may obtain a copy of the License at 2262+ * 2263+ * http://www.apache.org/licenses/LICENSE-2.0 2264+ * 2265+ * Unless required by applicable law or agreed to in writing, software 2266+ * distributed under the License is distributed on an "AS IS" BASIS, 2267+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
2268+ * See the License for the specific language governing permissions and 2269+ * limitations under the License. 2270+ */ 2271+ 2272+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX_H_ 2273+#define MINDSPORE_NNACL_FP32_EXP_AVX_H_ 2274+ 2275+#include "nnacl/intrinsics/ms_simd_instructions.h" 2276+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2277+ 2278+#ifdef __cplusplus 2279+extern "C" { 2280+#endif 2281+#pragma GCC push_options 2282+#pragma GCC target("avx", "avx2") 2283+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2284+#define BLOCK_NUM 8 2285+#define MS_SIMD_AVX 2286+ 2287+static inline int64_t ExpFp32AVX(int64_t index, const float *src, float *dst, int num) { 2288+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2289+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 2290+ } 2291+ return index; 2292+} 2293+ 2294+static inline int64_t ExpFp32WithInScaleAVX(int64_t index, const float *src, float *dst, int num, float in_scale) { 2295+ SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale); 2296+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2297+ SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index); 2298+ } 2299+ return index; 2300+} 2301+ 2302+static inline int64_t ExpFp32WithOutScaleAVX(int64_t index, const float *src, float *dst, int num, float out_scale) { 2303+ SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale); 2304+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2305+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 2306+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec)); 2307+ } 2308+ return index; 2309+} 2310+ 2311+#undef MS_SIMD_INSTRUCTION 2312+#undef BLOCK_NUM 2313+#pragma GCC pop_options 2314+#undef MS_SIMD_AVX 2315+#ifdef __cplusplus 2316+}; 2317+#endif 2318+#endif 2319diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h 2320new file mode 100644 2321index 00000000..8b01844e 2322--- /dev/null 2323+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h 2324@@ -0,0 +1,53 @@ 2325+/** 2326+ * Copyright 2022 Huawei Technologies Co., Ltd 2327+ * 2328+ * Licensed under the Apache License, Version 2.0 (the "License"); 2329+ * you may not use this file except in compliance with the License. 2330+ * You may obtain a copy of the License at 2331+ * 2332+ * http://www.apache.org/licenses/LICENSE-2.0 2333+ * 2334+ * Unless required by applicable law or agreed to in writing, software 2335+ * distributed under the License is distributed on an "AS IS" BASIS, 2336+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2337+ * See the License for the specific language governing permissions and 2338+ * limitations under the License. 2339+ */ 2340+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_ 2341+#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_ 2342+ 2343+#include "nnacl/intrinsics/ms_simd_instructions.h" 2344+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2345+ 2346+#ifdef __cplusplus 2347+extern "C" { 2348+#endif 2349+#pragma GCC push_options 2350+#pragma GCC target("avx", "avx2") 2351+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2352+#define BLOCK_NUM 8 2353+#define MS_SIMD_AVX 2354+ 2355+static inline int FillFp32AVX(int index, float *output, int size, float data) { 2356+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2357+ SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); 2358+ } 2359+ return index; 2360+} 2361+ 2362+static inline int FillInt32AVX(int index, int *output, int size, int data) { 2363+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2364+ SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); 2365+ } 2366+ return index; 2367+} 2368+ 2369+#undef 
MS_SIMD_INSTRUCTION 2370+#undef BLOCK_NUM 2371+#pragma GCC pop_options 2372+#undef MS_SIMD_AVX 2373+#ifdef __cplusplus 2374+} 2375+#endif 2376+#endif 2377+ 2378diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h 2379new file mode 100644 2380index 00000000..d5076e59 2381--- /dev/null 2382+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h 2383@@ -0,0 +1,77 @@ 2384+/** 2385+ * Copyright 2022 Huawei Technologies Co., Ltd 2386+ * 2387+ * Licensed under the Apache License, Version 2.0 (the "License"); 2388+ * you may not use this file except in compliance with the License. 2389+ * You may obtain a copy of the License at 2390+ * 2391+ * http://www.apache.org/licenses/LICENSE-2.0 2392+ * 2393+ * Unless required by applicable law or agreed to in writing, software 2394+ * distributed under the License is distributed on an "AS IS" BASIS, 2395+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2396+ * See the License for the specific language governing permissions and 2397+ * limitations under the License. 
2398+ */ 2399+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_ 2400+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_ 2401+ 2402+#include "nnacl/intrinsics/ms_simd_instructions.h" 2403+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2404+ 2405+#ifdef __cplusplus 2406+extern "C" { 2407+#endif 2408+#pragma GCC push_options 2409+#pragma GCC target("avx", "avx2") 2410+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2411+#define BLOCK_NUM 8 2412+#define MS_SIMD_AVX 2413+ 2414+static inline int64_t GroupNormFp32AVX(int64_t index, const float *unit_input, float scale, float offset, float mean, 2415+ float var_sqrt, int unit, float *unit_output) { 2416+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 2417+ SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); 2418+ SIMD_F32 scale_val = SIMD_MOV_F32(scale); 2419+ SIMD_F32 offset_val = SIMD_MOV_F32(offset); 2420+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2421+ SIMD_F32 input = SIMD_LD_F32(unit_input + index); 2422+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); 2423+ SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); 2424+ SIMD_ST_F32(unit_output + index, output); 2425+ } 2426+ return index; 2427+} 2428+ 2429+static inline int64_t GroupNormReduceSumAVX(int64_t index, const float *in, float *sum, int unit) { 2430+ if (unit - index >= 4 * BLOCK_NUM) { 2431+ SIMD_F32 tmp = SIMD_MOV_F32(0); 2432+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2433+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); 2434+ } 2435+ *sum += SIMD_GET_SUM_F32(tmp); 2436+ } 2437+ return index; 2438+} 2439+ 2440+static inline int64_t GroupNormReduceVarAVX(int64_t index, const float *in, float mean, float *sum, int unit) { 2441+ if (unit - index >= 4 * BLOCK_NUM) { 2442+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 2443+ SIMD_F32 tmp = SIMD_MOV_F32(0); 2444+ for (int block_max_size = unit - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { 2445+ SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); 2446+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); 2447+ } 2448+ *sum += SIMD_GET_SUM_F32(tmp); 2449+ } 2450+ return index; 2451+} 2452+ 2453+#undef MS_SIMD_INSTRUCTION 2454+#undef BLOCK_NUM 2455+#pragma GCC pop_options 2456+#undef MS_SIMD_AVX 2457+#ifdef __cplusplus 2458+} 2459+#endif 2460+#endif 2461diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h 2462new file mode 100644 2463index 00000000..96fdf185 2464--- /dev/null 2465+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h 2466@@ -0,0 +1,68 @@ 2467+/** 2468+ * Copyright 2022 Huawei Technologies Co., Ltd 2469+ * 2470+ * Licensed under the Apache License, Version 2.0 (the "License"); 2471+ * you may not use this file except in compliance with the License. 2472+ * You may obtain a copy of the License at 2473+ * 2474+ * http://www.apache.org/licenses/LICENSE-2.0 2475+ * 2476+ * Unless required by applicable law or agreed to in writing, software 2477+ * distributed under the License is distributed on an "AS IS" BASIS, 2478+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2479+ * See the License for the specific language governing permissions and 2480+ * limitations under the License. 
2481+ */ 2482+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_ 2483+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_ 2484+ 2485+#include "nnacl/intrinsics/ms_simd_instructions.h" 2486+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2487+ 2488+#ifdef __cplusplus 2489+extern "C" { 2490+#endif 2491+#pragma GCC push_options 2492+#pragma GCC target("avx", "avx2") 2493+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2494+#define BLOCK_NUM 8 2495+#define MS_SIMD_AVX 2496+ 2497+static inline int LayerNormMeanAndSquareAVX(int index, const float *src, int num, float *mean, float *square_mean) { 2498+ if (num >= 4 * BLOCK_NUM) { 2499+ SIMD_F32 sum_val = SIMD_SET0_F32; 2500+ SIMD_F32 square_sum_val = SIMD_SET0_F32; 2501+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2502+ SIMD_F32 value = SIMD_LD_F32(src + index); 2503+ SIMD_F32 square_value = SIMD_MUL_F32(value, value); 2504+ sum_val = SIMD_ADD_F32(sum_val, value); 2505+ square_sum_val = SIMD_ADD_F32(square_sum_val, square_value); 2506+ } 2507+ *mean += SIMD_GET_SUM_F32(sum_val); 2508+ *square_mean += SIMD_GET_SUM_F32(square_sum_val); 2509+ } 2510+ return index; 2511+} 2512+ 2513+static inline int LayerNormGammaAndBetaAVX(int index, float *dst, const float *src, const float *gamma_data, 2514+ const float *beta_data, int num, const float mean, const float deno) { 2515+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 2516+ SIMD_F32 deno_val = SIMD_MOV_F32(deno); 2517+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2518+ SIMD_F32 value = SIMD_LD_F32(src + index); 2519+ SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val); 2520+ out_value = SIMD_MUL_F32(out_value, deno_val); 2521+ out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index)); 2522+ SIMD_ST_F32(dst + index, out_value); 2523+ } 2524+ return index; 2525+} 2526+ 2527+#undef MS_SIMD_INSTRUCTION 2528+#undef BLOCK_NUM 2529+#pragma 
GCC pop_options 2530+#undef MS_SIMD_AVX 2531+#ifdef __cplusplus 2532+} 2533+#endif 2534+#endif 2535diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h 2536new file mode 100644 2537index 00000000..523e120e 2538--- /dev/null 2539+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h 2540@@ -0,0 +1,93 @@ 2541+/** 2542+ * Copyright 2022 Huawei Technologies Co., Ltd 2543+ * 2544+ * Licensed under the Apache License, Version 2.0 (the "License"); 2545+ * you may not use this file except in compliance with the License. 2546+ * You may obtain a copy of the License at 2547+ * 2548+ * http://www.apache.org/licenses/LICENSE-2.0 2549+ * 2550+ * Unless required by applicable law or agreed to in writing, software 2551+ * distributed under the License is distributed on an "AS IS" BASIS, 2552+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2553+ * See the License for the specific language governing permissions and 2554+ * limitations under the License. 2555+ */ 2556+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_ 2557+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_ 2558+ 2559+#include "nnacl/intrinsics/ms_simd_instructions.h" 2560+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2561+ 2562+#ifdef __cplusplus 2563+extern "C" { 2564+#endif 2565+#pragma GCC push_options 2566+#pragma GCC target("avx", "avx2") 2567+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2568+#define BLOCK_NUM 8 2569+#define MS_SIMD_AVX 2570+ 2571+// act_type must be 0, 1, 3. 0: no_act, 1: relu, 3: relu6. 
2572+static inline int64_t GemmIsNotPackAVX(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, 2573+ int deep, int act_type) { 2574+ SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); 2575+ SIMD_F32 up_threshold = SIMD_MOV_F32(6); 2576+ SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); 2577+ SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); 2578+ for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2579+ SIMD_F32 a_data = SIMD_LD_F32(a + index); 2580+ SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); 2581+ if (act_type != 0) { 2582+ dst = SIMD_MAX_F32(dst, down_threshold); 2583+ if (act_type == 3) { 2584+ dst = SIMD_MIN_F32(dst, up_threshold); 2585+ } 2586+ } 2587+ SIMD_ST_F32(c + index, dst); 2588+ } 2589+ 2590+ return index; 2591+} 2592+ 2593+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) 2594+static inline int64_t GemmIsNotPackOptimizeCoreAVX(int64_t index, const float *a, const float *b, int k, float *dst) { 2595+ SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); 2596+ for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2597+ SIMD_F32 weight = SIMD_LD_F32(b + index); 2598+ SIMD_F32 a1 = SIMD_LD_F32(a + index); 2599+ dst1 = SIMD_FMADD_F32(weight, a1, dst1); 2600+ } 2601+ *dst += SIMD_REDUCE_ADD_F32(dst1); 2602+ return index; 2603+} 2604+#endif 2605+ 2606+static inline int64_t MatVecMulNoPackCoreAVX(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, 2607+ int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { 2608+ for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { 2609+ SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? 
SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); 2610+ for (int64_t k = 0; k < depth; ++k) { 2611+ SIMD_F32 left = SIMD_MOV_F32(a[k]); 2612+ SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); 2613+ out = SIMD_FMADD_F32(left, right, out); 2614+ } 2615+ if ((inc_flag & 0x2) != 0 && act_type != 0) { 2616+ out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); 2617+ if (act_type == 0x3) { 2618+ out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); 2619+ } 2620+ } 2621+ SIMD_ST_F32(c + oc_index, out); 2622+ } 2623+ return oc_index; 2624+} 2625+ 2626+#undef MS_SIMD_INSTRUCTION 2627+#undef BLOCK_NUM 2628+#pragma GCC pop_options 2629+#undef MS_SIMD_AVX 2630+#ifdef __cplusplus 2631+} 2632+#endif 2633+#endif 2634diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h 2635new file mode 100644 2636index 00000000..a5d8b0a0 2637--- /dev/null 2638+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h 2639@@ -0,0 +1,218 @@ 2640+/** 2641+ * Copyright 2022 Huawei Technologies Co., Ltd 2642+ * 2643+ * Licensed under the Apache License, Version 2.0 (the "License"); 2644+ * you may not use this file except in compliance with the License. 2645+ * You may obtain a copy of the License at 2646+ * 2647+ * http://www.apache.org/licenses/LICENSE-2.0 2648+ * 2649+ * Unless required by applicable law or agreed to in writing, software 2650+ * distributed under the License is distributed on an "AS IS" BASIS, 2651+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2652+ * See the License for the specific language governing permissions and 2653+ * limitations under the License. 
2654+ */ 2655+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX_H_ 2656+#define MINDSPORE_NNACL_FP32_MUL_AVX_H_ 2657+ 2658+#include "nnacl/intrinsics/ms_simd_instructions.h" 2659+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2660+ 2661+#ifdef __cplusplus 2662+extern "C" { 2663+#endif 2664+#pragma GCC push_options 2665+#pragma GCC target("avx", "avx2") 2666+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2667+#define BLOCK_NUM 8 2668+#define MS_SIMD_AVX 2669+ 2670+static inline int ElementMulAVX(int index, const float *in0, const float *in1, float *out, int size) { 2671+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2672+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2673+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2674+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1); 2675+ SIMD_ST_F32(out + index, vout); 2676+ } 2677+ return index; 2678+} 2679+ 2680+static inline int ElementMulReluAVX(int index, const float *in0, const float *in1, float *out, int size) { 2681+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2682+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2683+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2684+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f); 2685+ SIMD_ST_F32(out + index, vout); 2686+ } 2687+ return index; 2688+} 2689+ 2690+static inline int ElementMulRelu6AVX(int index, const float *in0, const float *in1, float *out, int size) { 2691+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2692+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2693+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2694+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f); 2695+ SIMD_ST_F32(out + index, vout); 2696+ } 2697+ return index; 2698+} 2699+ 2700+static inline int ElementMulIntAVX(int index, const int *in0, const int *in1, int *out, int size) { 2701+ for (int block_max_size = size - BLOCK_NUM
+ 1; index < block_max_size; index += BLOCK_NUM) { 2702+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2703+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2704+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1); 2705+ SIMD_ST_EPI32(out + index, vout); 2706+ } 2707+ return index; 2708+} 2709+ 2710+static inline int ElementMulReluIntAVX(int index, const int *in0, const int *in1, int *out, int size) { 2711+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2712+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2713+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2714+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f); 2715+ SIMD_ST_EPI32(out + index, vout); 2716+ } 2717+ return index; 2718+} 2719+ 2720+static inline int ElementMulRelu6IntAVX(int index, const int *in0, const int *in1, int *out, int size) { 2721+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2722+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2723+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2724+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f); 2725+ SIMD_ST_EPI32(out + index, vout); 2726+ } 2727+ return index; 2728+} 2729+ 2730+static inline int ElementOptMulNum0AVX(int index, const float *in0, const float *in1, float *out, int size) { 2731+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 2732+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2733+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2734+ SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1); 2735+ SIMD_ST_F32(out + index, vout); 2736+ } 2737+ return index; 2738+} 2739+ 2740+static inline int ElementOptMulNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { 2741+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 2742+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2743+ SIMD_F32 vin0 = SIMD_LD_F32(in0 
+ index); 2744+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_); 2745+ SIMD_ST_F32(out + index, vout); 2746+ } 2747+ return index; 2748+} 2749+ 2750+static inline int ElementOptMulReluNum0AVX(int index, const float *in0, const float *in1, float *out, int size) { 2751+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 2752+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2753+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2754+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f); 2755+ SIMD_ST_F32(out + index, vout); 2756+ } 2757+ return index; 2758+} 2759+ 2760+static inline int ElementOptMulReluNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { 2761+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 2762+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2763+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2764+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f); 2765+ SIMD_ST_F32(out + index, vout); 2766+ } 2767+ return index; 2768+} 2769+ 2770+static inline int ElementOptMulRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, int size) { 2771+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 2772+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2773+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 2774+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f); 2775+ SIMD_ST_F32(out + index, vout); 2776+ } 2777+ return index; 2778+} 2779+ 2780+static inline int ElementOptMulRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, int size) { 2781+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 2782+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2783+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 2784+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f); 2785+ 
SIMD_ST_F32(out + index, vout); 2786+ } 2787+ return index; 2788+} 2789+ 2790+static inline int ElementOptMulIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { 2791+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 2792+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2793+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2794+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1); 2795+ SIMD_ST_EPI32(out + index, vout); 2796+ } 2797+ return index; 2798+} 2799+ 2800+static inline int ElementOptMulIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { 2801+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 2802+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2803+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2804+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_); 2805+ SIMD_ST_EPI32(out + index, vout); 2806+ } 2807+ return index; 2808+} 2809+ 2810+static inline int ElementOptMulReluIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { 2811+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 2812+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2813+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2814+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f); 2815+ SIMD_ST_EPI32(out + index, vout); 2816+ } 2817+ return index; 2818+} 2819+ 2820+static inline int ElementOptMulReluIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { 2821+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 2822+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2823+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2824+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f); 2825+ SIMD_ST_EPI32(out + index, vout); 2826+ } 2827+ return index; 2828+} 2829+ 2830+static inline int 
ElementOptMulRelu6IntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { 2831+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 2832+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2833+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 2834+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f); 2835+ SIMD_ST_EPI32(out + index, vout); 2836+ } 2837+ return index; 2838+} 2839+ 2840+static inline int ElementOptMulRelu6IntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { 2841+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 2842+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2843+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 2844+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f); 2845+ SIMD_ST_EPI32(out + index, vout); 2846+ } 2847+ return index; 2848+} 2849+ 2850+#undef MS_SIMD_INSTRUCTION 2851+#undef BLOCK_NUM 2852+#pragma GCC pop_options 2853+#undef MS_SIMD_AVX 2854+#ifdef __cplusplus 2855+} 2856+#endif 2857+#endif 2858diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h 2859new file mode 100644 2860index 00000000..d4bd2305 2861--- /dev/null 2862+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h 2863@@ -0,0 +1,84 @@ 2864+/** 2865+ * Copyright 2022 Huawei Technologies Co., Ltd 2866+ * 2867+ * Licensed under the Apache License, Version 2.0 (the "License"); 2868+ * you may not use this file except in compliance with the License. 
2869+ * You may obtain a copy of the License at 2870+ * 2871+ * http://www.apache.org/licenses/LICENSE-2.0 2872+ * 2873+ * Unless required by applicable law or agreed to in writing, software 2874+ * distributed under the License is distributed on an "AS IS" BASIS, 2875+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2876+ * See the License for the specific language governing permissions and 2877+ * limitations under the License. 2878+ */ 2879+#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX_H_ 2880+#define MINDSPORE_NNACL_FP32_POOLING_AVX_H_ 2881+ 2882+#include "nnacl/intrinsics/ms_simd_instructions.h" 2883+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2884+ 2885+#ifdef __cplusplus 2886+extern "C" { 2887+#endif 2888+#pragma GCC push_options 2889+#pragma GCC target("avx", "avx2") 2890+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2891+#define BLOCK_NUM 8 2892+#define MS_SIMD_AVX 2893+ 2894+static inline int AvgPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel, 2895+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 2896+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 2897+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 2898+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 2899+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 2900+ const float *src_c_ptr = src_plane_ptr + ci; 2901+ float *dst_c_ptr = dst_plane_ptr + ci; 2902+ SIMD_F32 tmp_avg = SIMD_SET0_F32; 2903+ int real_count = 0; 2904+ for (int h = real_win_h_start; h < real_win_h_end; h++) { 2905+ for (int w = real_win_w_start; w < real_win_w_end; w++) { 2906+ const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; 2907+ tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); 2908+ ++real_count; 2909+ } 2910+ } 2911+ tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); 2912+ tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); 
2913+ tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); 2914+ SIMD_ST_F32(dst_c_ptr, tmp_avg); 2915+ } 2916+ return ci; 2917+} 2918+ 2919+static inline int MaxPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel, 2920+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 2921+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 2922+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 2923+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 2924+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 2925+ const float *src_c_ptr = src_plane_ptr + ci; 2926+ float *dst_c_ptr = dst_plane_ptr + ci; 2927+ SIMD_F32 tmp_max = min_val; 2928+ for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { 2929+ for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { 2930+ const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; 2931+ tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); 2932+ } 2933+ } 2934+ tmp_max = SIMD_MIN_F32(tmp_max, max_val); 2935+ SIMD_ST_F32(dst_c_ptr, tmp_max); 2936+ } 2937+ return ci; 2938+} 2939+ 2940+#undef MS_SIMD_INSTRUCTION 2941+#undef BLOCK_NUM 2942+#pragma GCC pop_options 2943+#undef MS_SIMD_AVX 2944+#ifdef __cplusplus 2945+} 2946+#endif 2947+#endif 2948diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h 2949new file mode 100644 2950index 00000000..2ada6cb3 2951--- /dev/null 2952+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h 2953@@ -0,0 +1,101 @@ 2954+/** 2955+ * Copyright 2022 Huawei Technologies Co., Ltd 2956+ * 2957+ * Licensed under the Apache License, Version 2.0 (the "License"); 2958+ * you may not use this file except in compliance with the License. 
2959+ * You may obtain a copy of the License at 2960+ * 2961+ * http://www.apache.org/licenses/LICENSE-2.0 2962+ * 2963+ * Unless required by applicable law or agreed to in writing, software 2964+ * distributed under the License is distributed on an "AS IS" BASIS, 2965+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 2966+ * See the License for the specific language governing permissions and 2967+ * limitations under the License. 2968+ */ 2969+#ifndef MINDSPORE_NNACL_FP32_POWER_AVX_H_ 2970+#define MINDSPORE_NNACL_FP32_POWER_AVX_H_ 2971+ 2972+#include "nnacl/intrinsics/ms_simd_instructions.h" 2973+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 2974+ 2975+#ifdef __cplusplus 2976+extern "C" { 2977+#endif 2978+#pragma GCC push_options 2979+#pragma GCC target("avx", "avx2") 2980+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 2981+#define BLOCK_NUM 8 2982+#define MS_SIMD_AVX 2983+ 2984+static inline int PowerBroadCastIntExponentAVX(int index, const float *input, int exponent, float *output, int len, 2985+ float scale, float shift) { 2986+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 2987+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 2988+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 2989+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 2990+ SIMD_F32 result = SIMD_MOV_F32(1.0f); 2991+ int exp = abs(exponent); 2992+ while (exp) { 2993+ if (exp % 2) { 2994+ result = SIMD_MUL_F32(result, tmp); 2995+ } 2996+ tmp = SIMD_MUL_SQUARE_F32(tmp); 2997+ exp = exp / 2; 2998+ } 2999+ SIMD_ST_F32(output + index, exponent >= 0 ? 
result : SIMD_DIV_F32(SIMD_MOV_F32(1), result)); 3000+ } 3001+ return index; 3002+} 3003+ 3004+static inline int PowerBroadCastFloatExponentAVX(int index, const float *input, float exponent, float *output, int len, 3005+ float scale, float shift) { 3006+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 3007+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 3008+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3009+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 3010+ SIMD_F32 result; 3011+ for (int i = 0; i < BLOCK_NUM; ++i) { 3012+ SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent); 3013+ } 3014+ SIMD_ST_F32(output + index, result); 3015+ } 3016+ return index; 3017+} 3018+ 3019+static inline int PowerSingleExponentAVX(int index, const float *input, const float *exponent, float *output, int len, 3020+ float scale, float shift) { 3021+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 3022+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 3023+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3024+ SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 3025+ for (int j = 0; j < BLOCK_NUM; ++j) { 3026+ float cur_exponent = exponent[index + j]; 3027+ float cur_val = SIMD_F32_GETI(tmp_vec, j); 3028+ if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) { 3029+ int exp = abs((int)(cur_exponent)); 3030+ float result = 1; 3031+ while (exp) { 3032+ if (exp % 2) { 3033+ result *= cur_val; 3034+ } 3035+ cur_val *= cur_val; 3036+ exp = exp / 2; 3037+ } 3038+ output[index + j] = cur_exponent >= 0 ?
result : 1 / result; 3039+ } else { 3040+ output[index + j] = powf(cur_val, cur_exponent); 3041+ } 3042+ } 3043+ } 3044+ return index; 3045+} 3046+ 3047+#undef MS_SIMD_INSTRUCTION 3048+#undef BLOCK_NUM 3049+#pragma GCC pop_options 3050+#undef MS_SIMD_AVX 3051+#ifdef __cplusplus 3052+} 3053+#endif 3054+#endif 3055diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h 3056new file mode 100644 3057index 00000000..03339e42 3058--- /dev/null 3059+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h 3060@@ -0,0 +1,181 @@ 3061+/** 3062+ * Copyright 2022 Huawei Technologies Co., Ltd 3063+ * 3064+ * Licensed under the Apache License, Version 2.0 (the "License"); 3065+ * you may not use this file except in compliance with the License. 3066+ * You may obtain a copy of the License at 3067+ * 3068+ * http://www.apache.org/licenses/LICENSE-2.0 3069+ * 3070+ * Unless required by applicable law or agreed to in writing, software 3071+ * distributed under the License is distributed on an "AS IS" BASIS, 3072+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 3073+ * See the License for the specific language governing permissions and 3074+ * limitations under the License. 
3075+ */ 3076+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_ 3077+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_ 3078+ 3079+#include "nnacl/intrinsics/ms_simd_instructions.h" 3080+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 3081+ 3082+#ifdef __cplusplus 3083+extern "C" { 3084+#endif 3085+#pragma GCC push_options 3086+#pragma GCC target("avx", "avx2") 3087+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 3088+#define BLOCK_NUM 8 3089+#define MS_SIMD_AVX 3090+ 3091+static inline int64_t ReduceSumAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3092+ int axis_size) { 3093+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3094+ const float *inner_src = outer_src + index; 3095+ SIMD_F32 tmp = SIMD_MOV_F32(0); 3096+ for (int i = 0; i < axis_size; i++) { 3097+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 3098+ } 3099+ SIMD_ST_F32(outer_dst + index, tmp); 3100+ } 3101+ return index; 3102+} 3103+ 3104+static inline int64_t ReduceMeanAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3105+ int axis_size) { 3106+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3107+ const float *inner_src = outer_src + index; 3108+ SIMD_F32 tmp = SIMD_MOV_F32(0); 3109+ for (int i = 0; i < axis_size; i++) { 3110+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 3111+ } 3112+ SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size)); 3113+ } 3114+ return index; 3115+} 3116+ 3117+static inline int64_t ReduceMinAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3118+ int axis_size) { 3119+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3120+ const float *inner_src = outer_src + index; 3121+ SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX); 3122+ for (int i = 0; i < axis_size; i++) { 3123+ tmp = SIMD_MIN_F32(tmp, 
SIMD_LD_F32(inner_src + i * inner_size)); 3124+ } 3125+ SIMD_ST_F32(outer_dst + index, tmp); 3126+ } 3127+ return index; 3128+} 3129+ 3130+static inline int64_t ReduceMaxAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3131+ int axis_size) { 3132+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3133+ const float *inner_src = outer_src + index; 3134+ SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX); 3135+ for (int i = 0; i < axis_size; i++) { 3136+ tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 3137+ } 3138+ SIMD_ST_F32(outer_dst + index, tmp); 3139+ } 3140+ return index; 3141+} 3142+ 3143+static inline int64_t ReduceProdAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3144+ int axis_size) { 3145+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3146+ const float *inner_src = outer_src + index; 3147+ SIMD_F32 tmp = SIMD_MOV_F32(1.0f); 3148+ for (int i = 0; i < axis_size; i++) { 3149+ tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 3150+ } 3151+ SIMD_ST_F32(outer_dst + index, tmp); 3152+ } 3153+ return index; 3154+} 3155+ 3156+static inline int64_t ReduceSumSquareAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3157+ int axis_size) { 3158+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3159+ const float *inner_src = outer_src + index; 3160+ SIMD_F32 tmp = SIMD_MOV_F32(0); 3161+ for (int i = 0; i < axis_size; i++) { 3162+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 3163+ } 3164+ SIMD_ST_F32(outer_dst + index, tmp); 3165+ } 3166+ return index; 3167+} 3168+ 3169+static inline int64_t ReduceL2NormAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 3170+ int axis_size) { 3171+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index <
block_max_size; index += BLOCK_NUM) { 3172+ const float *inner_src = outer_src + index; 3173+ SIMD_F32 tmp = SIMD_MOV_F32(0); 3174+ for (int i = 0; i < axis_size; i++) { 3175+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 3176+ } 3177+ SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); 3178+ } 3179+ return index; 3180+} 3181+ 3182+static inline int64_t IntReduceSumAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 3183+ int axis_size) { 3184+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3185+ const int *inner_src = outer_src + index; 3186+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 3187+ for (int i = 0; i < axis_size; i++) { 3188+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 3189+ } 3190+ SIMD_ST_EPI32(outer_dst + index, tmp); 3191+ } 3192+ return index; 3193+} 3194+ 3195+static inline int64_t IntReduceMeanAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 3196+ int axis_size) { 3197+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3198+ const int *inner_src = outer_src + index; 3199+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 3200+ for (int i = 0; i < axis_size; i++) { 3201+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 3202+ } 3203+ SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); 3204+ } 3205+ return index; 3206+} 3207+ 3208+static inline int64_t IntReduceMinAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 3209+ int axis_size) { 3210+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3211+ const int *inner_src = outer_src + index; 3212+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); 3213+ for (int i = 0; i < axis_size; i++) { 3214+ tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 3215+ } 3216+ SIMD_ST_EPI32(outer_dst + index, 
tmp); 3217+ } 3218+ return index; 3219+} 3220+ 3221+static inline int64_t IntReduceMaxAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 3222+ int axis_size) { 3223+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3224+ const int *inner_src = outer_src + index; 3225+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); 3226+ for (int i = 0; i < axis_size; i++) { 3227+ tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 3228+ } 3229+ SIMD_ST_EPI32(outer_dst + index, tmp); 3230+ } 3231+ return index; 3232+} 3233+ 3234+#undef MS_SIMD_INSTRUCTION 3235+#undef BLOCK_NUM 3236+#pragma GCC pop_options 3237+#undef MS_SIMD_AVX 3238+#ifdef __cplusplus 3239+} 3240+#endif 3241+#endif 3242diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h 3243new file mode 100644 3244index 00000000..8229111d 3245--- /dev/null 3246+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h 3247@@ -0,0 +1,87 @@ 3248+/** 3249+ * Copyright 2022 Huawei Technologies Co., Ltd 3250+ * 3251+ * Licensed under the Apache License, Version 2.0 (the "License"); 3252+ * you may not use this file except in compliance with the License. 3253+ * You may obtain a copy of the License at 3254+ * 3255+ * http://www.apache.org/licenses/LICENSE-2.0 3256+ * 3257+ * Unless required by applicable law or agreed to in writing, software 3258+ * distributed under the License is distributed on an "AS IS" BASIS, 3259+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 3260+ * See the License for the specific language governing permissions and 3261+ * limitations under the License. 
3262+ */ 3263+ 3264+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_ 3265+#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_ 3266+ 3267+#include "nnacl/intrinsics/ms_simd_instructions.h" 3268+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 3269+ 3270+#ifdef __cplusplus 3271+extern "C" { 3272+#endif 3273+#pragma GCC push_options 3274+#pragma GCC target("avx", "avx2") 3275+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 3276+#define BLOCK_NUM 8 3277+#define MS_SIMD_AVX 3278+ 3279+static inline int64_t SoftmaxNormGetMaxAVX(int64_t index, const float *src, int cur_batch_offset, 3280+ float *max, int channel) { 3281+ if (channel >= BLOCK_NUM * BLOCK_NUM) { 3282+ SIMD_F32 max_val = SIMD_MOV_F32(*max); 3283+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3284+ max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); 3285+ } 3286+ *max = SIMD_GET_MAX_F32(max_val); 3287+ } 3288+ return index; 3289+} 3290+ 3291+static inline int64_t SoftmaxNormCalcNormAVX(int64_t index, const float *src, float *dst, 3292+ int cur_batch_offset, float max, int channel) { 3293+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3294+ SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); 3295+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 3296+ } 3297+ return index; 3298+} 3299+ 3300+static inline int64_t SoftmaxLastAxisGetExpSumAVX(int64_t index, const float *src, float *dst, 3301+ int cur_batch_offset, float max, float *exp_sum, int channel) { 3302+#ifndef _WIN32 3303+ SIMD_F32 sum_val = SIMD_SET0_F32; 3304+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3305+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 3306+ SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); 3307+ SIMD_F32 exp_out = SIMD_EXP_F32(output); 3308+ sum_val = SIMD_ADD_F32(sum_val, exp_out); 3309+ 
SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); 3310+ } 3311+ *exp_sum += SIMD_GET_SUM_F32(sum_val); 3312+#endif 3313+ return index; 3314+} 3315+ 3316+static inline int64_t SoftmaxLastAxisGetResultAVX(int64_t index, const float *src, float *dst, 3317+ int cur_batch_offset, float exp_sum, int channel) { 3318+ SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); 3319+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3320+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 3321+ SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); 3322+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 3323+ } 3324+ return index; 3325+} 3326+ 3327+#undef MS_SIMD_INSTRUCTION 3328+#undef BLOCK_NUM 3329+#pragma GCC pop_options 3330+#undef MS_SIMD_AVX 3331+#ifdef __cplusplus 3332+}; 3333+#endif 3334+#endif 3335diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h 3336new file mode 100644 3337index 00000000..a3ed93d4 3338--- /dev/null 3339+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h 3340@@ -0,0 +1,167 @@ 3341+/** 3342+ * Copyright 2022 Huawei Technologies Co., Ltd 3343+ * 3344+ * Licensed under the Apache License, Version 2.0 (the "License"); 3345+ * you may not use this file except in compliance with the License. 3346+ * You may obtain a copy of the License at 3347+ * 3348+ * http://www.apache.org/licenses/LICENSE-2.0 3349+ * 3350+ * Unless required by applicable law or agreed to in writing, software 3351+ * distributed under the License is distributed on an "AS IS" BASIS, 3352+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 3353+ * See the License for the specific language governing permissions and 3354+ * limitations under the License. 
3355+ */ 3356+ 3357+#ifndef MINDSPORE_NNACL_FP32_SUB_AVX_H_ 3358+#define MINDSPORE_NNACL_FP32_SUB_AVX_H_ 3359+ 3360+#include "nnacl/intrinsics/ms_simd_instructions.h" 3361+#include "nnacl/intrinsics/ms_simd_avx_instructions.h" 3362+ 3363+#ifdef __cplusplus 3364+extern "C" { 3365+#endif 3366+#pragma GCC push_options 3367+#pragma GCC target("avx", "avx2") 3368+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION 3369+#define BLOCK_NUM 8 3370+#define MS_SIMD_AVX 3371+ 3372+static inline int ElementOptSubNum0AVX(int index, const float *in0, const float *in1, float *out, 3373+ int size) { 3374+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 3375+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3376+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 3377+ SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); 3378+ SIMD_ST_F32(out + index, vout); 3379+ } 3380+ return index; 3381+} 3382+ 3383+static inline int ElementOptSubNum1AVX(int index, const float *in0, const float *in1, float *out, 3384+ int size) { 3385+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 3386+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3387+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 3388+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); 3389+ SIMD_ST_F32(out + index, vout); 3390+ } 3391+ return index; 3392+} 3393+ 3394+static inline int ElementOptSubIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { 3395+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 3396+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3397+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 3398+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); 3399+ SIMD_ST_EPI32(out + index, vout); 3400+ } 3401+ return index; 3402+} 3403+ 3404+static inline int ElementOptSubIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { 3405+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 3406+ for (int 
block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3407+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 3408+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); 3409+ SIMD_ST_EPI32(out + index, vout); 3410+ } 3411+ return index; 3412+} 3413+ 3414+static inline int ElementOptSubReluNum0AVX(int index, const float *in0, const float *in1, float *out, 3415+ int size) { 3416+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 3417+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3418+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 3419+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); 3420+ SIMD_ST_F32(out + index, vout); 3421+ } 3422+ return index; 3423+} 3424+ 3425+static inline int ElementOptSubReluNum1AVX(int index, const float *in0, const float *in1, float *out, 3426+ int size) { 3427+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 3428+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3429+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 3430+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); 3431+ SIMD_ST_F32(out + index, vout); 3432+ } 3433+ return index; 3434+} 3435+ 3436+static inline int ElementOptSubRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, 3437+ int size) { 3438+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 3439+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3440+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 3441+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); 3442+ SIMD_ST_F32(out + index, vout); 3443+ } 3444+ return index; 3445+} 3446+ 3447+static inline int ElementOptSubRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, 3448+ int size) { 3449+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 3450+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 
3451+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 3452+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); 3453+ SIMD_ST_F32(out + index, vout); 3454+ } 3455+ return index; 3456+} 3457+ 3458+static inline int ElementSubAVX(int index, const float *in0, const float *in1, float *out, int size) { 3459+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3460+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 3461+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 3462+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); 3463+ SIMD_ST_F32(out + index, vout); 3464+ } 3465+ return index; 3466+} 3467+ 3468+static inline int ElementSubIntAVX(int index, const int *in0, const int *in1, int *out, int size) { 3469+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3470+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 3471+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 3472+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); 3473+ SIMD_ST_EPI32(out + index, vout); 3474+ } 3475+ return index; 3476+} 3477+ 3478+static inline int ElementSubReluAVX(int index, const float *in0, const float *in1, float *out, 3479+ int size) { 3480+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3481+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 3482+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 3483+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); 3484+ SIMD_ST_F32(out + index, vout); 3485+ } 3486+ return index; 3487+} 3488+ 3489+static inline int ElementSubRelu6AVX(int index, const float *in0, const float *in1, float *out, 3490+ int size) { 3491+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3492+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 3493+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 3494+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); 3495+ SIMD_ST_F32(out + index, 
vout); 3496+ } 3497+ return index; 3498+} 3499+ 3500+#undef MS_SIMD_INSTRUCTION 3501+#undef BLOCK_NUM 3502+#pragma GCC pop_options 3503+#undef MS_SIMD_AVX 3504+#ifdef __cplusplus 3505+}; 3506+#endif 3507+#endif 3508diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h 3509new file mode 100644 3510index 00000000..f6457628 3511--- /dev/null 3512+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h 3513@@ -0,0 +1,221 @@ 3514+/** 3515+ * Copyright 2022 Huawei Technologies Co., Ltd 3516+ * 3517+ * Licensed under the Apache License, Version 2.0 (the "License"); 3518+ * you may not use this file except in compliance with the License. 3519+ * You may obtain a copy of the License at 3520+ * 3521+ * http://www.apache.org/licenses/LICENSE-2.0 3522+ * 3523+ * Unless required by applicable law or agreed to in writing, software 3524+ * distributed under the License is distributed on an "AS IS" BASIS, 3525+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 3526+ * See the License for the specific language governing permissions and 3527+ * limitations under the License. 
3528+ */ 3529+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_ 3530+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_ 3531+ 3532+#include "nnacl/intrinsics/ms_simd_instructions.h" 3533+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 3534+ 3535+#ifdef __cplusplus 3536+extern "C" { 3537+#endif 3538+#pragma GCC push_options 3539+#pragma GCC target("avx512f") 3540+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 3541+#define BLOCK_NUM 16 3542+#define MS_SIMD_AVX512 3543+ 3544+static inline int Fp32ReluAVX512(int index, const float *src, int length, float *dst) { 3545+ SIMD_F32 zero = SIMD_SET0_F32; 3546+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3547+ SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); 3548+ } 3549+ return index; 3550+} 3551+ 3552+static inline int Int32ReluAVX512(int index, const int32_t *src, int length, int32_t *dst) { 3553+ SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f); 3554+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3555+ SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); 3556+ } 3557+ return index; 3558+} 3559+ 3560+static inline int Fp32Relu6AVX512(int index, const float *src, int length, float *dst) { 3561+ SIMD_F32 zero = SIMD_SET0_F32; 3562+ SIMD_F32 six = SIMD_MOV_F32(6.0f); 3563+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3564+ SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); 3565+ } 3566+ return index; 3567+} 3568+ 3569+static inline int LReluAVX512(int index, const float *src, int length, float *dst, float alpha) { 3570+ SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); 3571+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3572+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 3573+ SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp); 3574+ 
SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask)); 3575+ } 3576+ return index; 3577+} 3578+ 3579+static inline int SigmoidAVX512(int index, const float *src, int length, float *dst) { 3580+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3581+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); 3582+ SIMD_ST_F32(dst + index, 3583+ SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 3584+ } 3585+ return index; 3586+} 3587+ 3588+static inline int TanhAVX512(int index, const float *src, int length, float *dst) { 3589+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3590+ SIMD_F32 input = SIMD_LD_F32(src + index); 3591+ SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); 3592+ } 3593+ return index; 3594+} 3595+ 3596+static inline int SwishAVX512(int index, const float *src, int length, float *dst) { 3597+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3598+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 3599+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); 3600+ SIMD_ST_F32(dst + index, 3601+ SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 3602+ } 3603+ return index; 3604+} 3605+ 3606+static inline int HSwishAVX512(int index, const float *src, int length, float *dst) { 3607+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3608+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 3609+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 3610+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); 3611+ } 3612+ return index; 3613+} 3614+ 3615+static inline int HSigmoidAVX512(int index, const float *src, int length, float *dst) { 3616+ for (int block_max_size = length - 
BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3617+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 3618+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 3619+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); 3620+ } 3621+ return index; 3622+} 3623+ 3624+static inline int HardTanhNoLimitMinAVX512(int index, const float *src, int length, float *dst, float min_val, 3625+ float max_val) { 3626+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3627+ SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); 3628+ } 3629+ return index; 3630+} 3631+ 3632+static inline int HardTanhNoLimitMaxAVX512(int index, const float *src, int length, float *dst, float min_val, 3633+ float max_val) { 3634+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3635+ SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); 3636+ } 3637+ return index; 3638+} 3639+ 3640+static inline int HardTanhLimitMinMaxAVX512(int index, const float *src, int length, float *dst, float min_val, 3641+ float max_val) { 3642+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3643+ SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); 3644+ } 3645+ return index; 3646+} 3647+ 3648+static inline int GeluApproximateAVX512(int index, const float *src, int length, float *dst) { 3649+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3650+ SIMD_F32 in = SIMD_LD_F32(src + index); 3651+ SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); 3652+ SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); 3653+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); 3654+ } 3655+ return index; 3656+} 3657+ 3658+static inline int GeluAVX512(int 
index, const float *src, int length, float *dst) { 3659+ SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); 3660+ SIMD_F32 para2 = SIMD_MOV_F32(1.0f); 3661+ SIMD_F32 para3 = SIMD_MOV_F32(0.5f); 3662+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3663+ SIMD_F32 in = SIMD_LD_F32(src + index); 3664+ SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); 3665+ SIMD_ST_F32(dst + index, res); 3666+ } 3667+ return index; 3668+} 3669+ 3670+static inline int EluAVX512(int index, const float *src, int length, float *dst, float alpha) { 3671+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3672+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 3673+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); 3674+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 3675+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 3676+ } 3677+ return index; 3678+} 3679+ 3680+static inline int CeluAVX512(int index, const float *src, int length, float *dst, float alpha) { 3681+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3682+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 3683+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); 3684+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 3685+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 3686+ } 3687+ return index; 3688+} 3689+ 3690+static inline int HShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) { 3691+ const float neg_lambd = -1 * lambd; 3692+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3693+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 3694+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd)); 3695+ SIMD_MASK 
mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp); 3696+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 3697+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask)); 3698+ } 3699+ return index; 3700+} 3701+ 3702+static inline int SoftShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) { 3703+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 3704+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 3705+ 3706+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3707+ SIMD_F32 src_t = SIMD_LD_F32(src + index); 3708+ /* v0 = (in > lamdb) & (in - lamdb) */ 3709+ SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v)); 3710+ /* v1 = (in < -lamdb) & (in + lamdb) */ 3711+ SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v)); 3712+ /* out = (v0 | v1) */ 3713+ SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1)); 3714+ } 3715+ return index; 3716+} 3717+ 3718+static inline int SoftsignFp32OptAVX512(int index, const float *src, int length, float *dst) { 3719+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3720+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 3721+ SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp)); 3722+ SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp)); 3723+ } 3724+ return index; 3725+} 3726+ 3727+#undef MS_SIMD_INSTRUCTION 3728+#undef BLOCK_NUM 3729+#pragma GCC pop_options 3730+#undef MS_SIMD_AVX512 3731+#ifdef __cplusplus 3732+} 3733+#endif 3734+#endif 3735diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h 3736new file mode 100644 3737index 00000000..62d34db4 3738--- /dev/null 3739+++ 
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h 3740@@ -0,0 +1,57 @@ 3741+/** 3742+ * Copyright 2022 Huawei Technologies Co., Ltd 3743+ * 3744+ * Licensed under the Apache License, Version 2.0 (the "License"); 3745+ * you may not use this file except in compliance with the License. 3746+ * You may obtain a copy of the License at 3747+ * 3748+ * http://www.apache.org/licenses/LICENSE-2.0 3749+ * 3750+ * Unless required by applicable law or agreed to in writing, software 3751+ * distributed under the License is distributed on an "AS IS" BASIS, 3752+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 3753+ * See the License for the specific language governing permissions and 3754+ * limitations under the License. 3755+ */ 3756+ 3757+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_ 3758+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_ 3759+ 3760+#include "nnacl/intrinsics/ms_simd_instructions.h" 3761+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 3762+ 3763+#ifdef __cplusplus 3764+extern "C" { 3765+#endif 3766+#pragma GCC push_options 3767+#pragma GCC target("avx512f") 3768+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 3769+#define BLOCK_NUM 16 3770+#define MS_SIMD_AVX512 3771+ 3772+static inline int ShrinkGradAVX512(int index, const float *src0, const float *src1, 3773+ int length, float *dst, float lambd) { 3774+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 3775+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 3776+ 3777+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3778+ SIMD_F32 src0_t = SIMD_LD_F32(src0 + index); 3779+ SIMD_F32 src1_t = SIMD_LD_F32(src1 + index); 3780+ 3781+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v); 3782+ SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t); 3783+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 3784+ 3785+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, 
SIMD_MOV_F32(0.0f), mask)); 3786+ } 3787+ return index; 3788+} 3789+ 3790+#undef MS_SIMD_INSTRUCTION 3791+#undef BLOCK_NUM 3792+#pragma GCC pop_options 3793+#undef MS_SIMD_AVX512 3794+#ifdef __cplusplus 3795+} 3796+#endif 3797+#endif 3798diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h 3799new file mode 100644 3800index 00000000..0579d58a 3801--- /dev/null 3802+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h 3803@@ -0,0 +1,210 @@ 3804+/** 3805+ * Copyright 2022 Huawei Technologies Co., Ltd 3806+ * 3807+ * Licensed under the Apache License, Version 2.0 (the "License"); 3808+ * you may not use this file except in compliance with the License. 3809+ * You may obtain a copy of the License at 3810+ * 3811+ * http://www.apache.org/licenses/LICENSE-2.0 3812+ * 3813+ * Unless required by applicable law or agreed to in writing, software 3814+ * distributed under the License is distributed on an "AS IS" BASIS, 3815+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 3816+ * See the License for the specific language governing permissions and 3817+ * limitations under the License. 
3818+ */ 3819+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_ 3820+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_ 3821+ 3822+#include "nnacl/intrinsics/ms_simd_instructions.h" 3823+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 3824+ 3825+#ifdef __cplusplus 3826+extern "C" { 3827+#endif 3828+#pragma GCC push_options 3829+#pragma GCC target("avx512f") 3830+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 3831+#define BLOCK_NUM 16 3832+#define MS_SIMD_AVX512 3833+#ifdef MS_SIMD_AVX512 3834+ static inline size_t AdamWeightDecayFp32AVX512(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 3835+ const float *gradient, size_t end) { 3836+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 3837+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 3838+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 3839+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 3840+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 3841+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 3842+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 3843+ 3844+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3845+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 3846+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 3847+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 3848+ SIMD_F32 g_r = SIMD_LD_F32(gradient + index); 3849+ 3850+ m_r = SIMD_MUL_F32(m_r, beta1_r); 3851+ v_r = SIMD_MUL_F32(v_r, beta2_r); 3852+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 3853+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 3854+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 3855+ avx_r0 = SIMD_SQRT_F32(v_r); 3856+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 3857+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 3858+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 3859+ SIMD_ST_F32(m + index, m_r); 3860+ SIMD_ST_F32(v + index, v_r); 3861+ SIMD_ST_F32(var + index, var_r); 3862+ } 3863+ 3864+ return index; 3865+} 3866+ 3867+static inline 
size_t FusedCastAdamFp32Fp16AVX512(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 3868+ float global_norm_reciprocal, size_t end) { 3869+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 3870+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 3871+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 3872+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 3873+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 3874+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 3875+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 3876+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 3877+ 3878+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3879+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 3880+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 3881+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 3882+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 3883+ 3884+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 3885+ m_r = SIMD_MUL_F32(m_r, beta1_r); 3886+ v_r = SIMD_MUL_F32(v_r, beta2_r); 3887+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 3888+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 3889+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 3890+ avx_r0 = SIMD_SQRT_F32(v_r); 3891+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 3892+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 3893+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 3894+ SIMD_ST_F32(var + index, var_r); 3895+ SIMD_ST_F32(m + index, m_r); 3896+ SIMD_ST_F32(v + index, v_r); 3897+ } 3898+ 3899+ return index; 3900+} 3901+ 3902+static inline size_t FusedCastAdamFp32Fp32AVX512(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 3903+ float global_norm_reciprocal, size_t end) { 3904+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 3905+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 3906+ SIMD_F32 
beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 3907+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 3908+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 3909+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 3910+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 3911+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 3912+ 3913+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3914+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 3915+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 3916+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 3917+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 3918+ 3919+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 3920+ m_r = SIMD_MUL_F32(m_r, beta1_r); 3921+ v_r = SIMD_MUL_F32(v_r, beta2_r); 3922+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 3923+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 3924+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 3925+ avx_r0 = SIMD_SQRT_F32(v_r); 3926+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 3927+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 3928+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 3929+ SIMD_ST_F32(var + index, var_r); 3930+ SIMD_ST_F32(m + index, m_r); 3931+ SIMD_ST_F32(v + index, v_r); 3932+ } 3933+ 3934+ return index; 3935+} 3936+ 3937+static inline size_t FusedCastAdamFp16Fp16AVX512(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 3938+ float global_norm_reciprocal, size_t end) { 3939+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 3940+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 3941+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 3942+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 3943+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 3944+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 3945+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 3946+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 3947+ 3948+ for (size_t 
block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3949+ SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); 3950+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 3951+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 3952+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 3953+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 3954+ m_r = SIMD_MUL_F32(m_r, beta1_r); 3955+ v_r = SIMD_MUL_F32(v_r, beta2_r); 3956+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 3957+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 3958+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 3959+ avx_r0 = SIMD_SQRT_F32(v_r); 3960+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 3961+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 3962+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 3963+ SIMD_ST_F32(m + index, m_r); 3964+ SIMD_ST_F32(v + index, v_r); 3965+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 3966+ } 3967+ 3968+ return index; 3969+} 3970+ 3971+static inline size_t FusedCastAdamFp16Fp32AVX512(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 3972+ float global_norm_reciprocal, size_t end) { 3973+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 3974+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 3975+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 3976+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 3977+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 3978+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 3979+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 3980+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 3981+ 3982+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 3983+ SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); 3984+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 3985+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 3986+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + 
index); 3987+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 3988+ m_r = SIMD_MUL_F32(m_r, beta1_r); 3989+ v_r = SIMD_MUL_F32(v_r, beta2_r); 3990+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 3991+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 3992+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 3993+ avx_r0 = SIMD_SQRT_F32(v_r); 3994+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 3995+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 3996+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 3997+ SIMD_ST_F32(m + index, m_r); 3998+ SIMD_ST_F32(v + index, v_r); 3999+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 4000+ } 4001+ 4002+ return index; 4003+} 4004+#endif 4005+ 4006+#undef MS_SIMD_INSTRUCTION 4007+#undef BLOCK_NUM 4008+#pragma GCC pop_options 4009+#undef MS_SIMD_AVX512 4010+#ifdef __cplusplus 4011+} 4012+#endif 4013+#endif 4014diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h 4015new file mode 100644 4016index 00000000..5ec6a42e 4017--- /dev/null 4018+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h 4019@@ -0,0 +1,124 @@ 4020+/** 4021+ * Copyright 2022 Huawei Technologies Co., Ltd 4022+ * 4023+ * Licensed under the Apache License, Version 2.0 (the "License"); 4024+ * you may not use this file except in compliance with the License. 4025+ * You may obtain a copy of the License at 4026+ * 4027+ * http://www.apache.org/licenses/LICENSE-2.0 4028+ * 4029+ * Unless required by applicable law or agreed to in writing, software 4030+ * distributed under the License is distributed on an "AS IS" BASIS, 4031+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4032+ * See the License for the specific language governing permissions and 4033+ * limitations under the License. 
4034+ */ 4035+ 4036+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX512_H_ 4037+#define MINDSPORE_NNACL_FP32_ADD_AVX512_H_ 4038+ 4039+#include "nnacl/intrinsics/ms_simd_instructions.h" 4040+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4041+ 4042+#ifdef __cplusplus 4043+extern "C" { 4044+#endif 4045+#pragma GCC push_options 4046+#pragma GCC target("avx512f") 4047+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4048+#define BLOCK_NUM 16 4049+#define MS_SIMD_AVX512 4050+ 4051+static inline int ElementOptAddAVX512(int index, const float *in0, const float *in1, float *out, int size) { 4052+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 4053+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4054+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 4055+ SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); 4056+ SIMD_ST_F32(out + index, vout); 4057+ } 4058+ return index; 4059+} 4060+ 4061+static inline int ElementOptAddIntAVX512(int index, const int *in0, const int *in1, int *out, 4062+ int size) { 4063+ SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); 4064+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4065+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 4066+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); 4067+ SIMD_ST_EPI32(out + index, vout); 4068+ } 4069+ return index; 4070+} 4071+ 4072+static inline int ElementOptAddReluAVX512(int index, const float *in0, const float *in1, float *out, 4073+ int size) { 4074+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 4075+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4076+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 4077+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); 4078+ SIMD_ST_F32(out + index, vout); 4079+ } 4080+ return index; 4081+} 4082+ 4083+static inline int ElementOptAddRelu6AVX512(int index, const float *in0, const float *in1, float *out, 4084+ int size) { 4085+ SIMD_F32 vin0_ = 
SIMD_MOV_F32(in0[0]); 4086+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4087+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 4088+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); 4089+ SIMD_ST_F32(out + index, vout); 4090+ } 4091+ return index; 4092+} 4093+ 4094+static inline int ElementAddAVX512(int index, const float *in0, const float *in1, float *out, int size) { 4095+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4096+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 4097+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 4098+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 4099+ SIMD_ST_F32(out + index, vout); 4100+ } 4101+ return index; 4102+} 4103+ 4104+static inline int ElementAddReluAVX512(int index, const float *in0, const float *in1, float *out, 4105+ int size) { 4106+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4107+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 4108+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 4109+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); 4110+ SIMD_ST_F32(out + index, vout); 4111+ } 4112+ return index; 4113+} 4114+ 4115+static inline int ElementAddRelu6AVX512(int index, const float *in0, const float *in1, float *out, 4116+ int size) { 4117+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4118+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 4119+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 4120+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); 4121+ SIMD_ST_F32(out + index, vout); 4122+ } 4123+ return index; 4124+} 4125+ 4126+static inline int ElementAddIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 4127+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4128+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 4129+ 
SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 4130+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1); 4131+ SIMD_ST_EPI32(out + index, vout); 4132+ } 4133+ return index; 4134+} 4135+ 4136+#undef MS_SIMD_INSTRUCTION 4137+#undef BLOCK_NUM 4138+#pragma GCC pop_options 4139+#undef MS_SIMD_AVX512 4140+#ifdef __cplusplus 4141+} 4142+#endif 4143+#endif 4144diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h 4145new file mode 100644 4146index 00000000..aa478969 4147--- /dev/null 4148+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h 4149@@ -0,0 +1,254 @@ 4150+/** 4151+ * Copyright 2022 Huawei Technologies Co., Ltd 4152+ * 4153+ * Licensed under the Apache License, Version 2.0 (the "License"); 4154+ * you may not use this file except in compliance with the License. 4155+ * You may obtain a copy of the License at 4156+ * 4157+ * http://www.apache.org/licenses/LICENSE-2.0 4158+ * 4159+ * Unless required by applicable law or agreed to in writing, software 4160+ * distributed under the License is distributed on an "AS IS" BASIS, 4161+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4162+ * See the License for the specific language governing permissions and 4163+ * limitations under the License. 
4164+ */ 4165+ 4166+#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX512_H_ 4167+#define MINDSPORE_NNACL_ARITHMETIC_AVX512_H_ 4168+ 4169+#include "nnacl/intrinsics/ms_simd_instructions.h" 4170+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4171+ 4172+#ifdef __cplusplus 4173+extern "C" { 4174+#endif 4175+#pragma GCC push_options 4176+#pragma GCC target("avx512f") 4177+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4178+#define BLOCK_NUM 16 4179+#define MS_SIMD_AVX512 4180+ 4181+#ifndef MS_SIMD_NEON 4182+static inline int ElementFloorModAVX512(int index, const float *in0, const float *in1, float *out, int size) { 4183+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4184+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4185+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4186+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 4187+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 4188+ SIMD_ST_F32(out + index, out_tmp); 4189+ } 4190+ return index; 4191+} 4192+ 4193+static inline int ElementOptFloorModNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4194+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 4195+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4196+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4197+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 4198+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 4199+ SIMD_ST_F32(out + index, out_tmp); 4200+ } 4201+ return index; 4202+} 4203+ 4204+static inline int ElementOptFloorModNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4205+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 4206+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4207+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4208+ SIMD_F32 floor_tmp = 
SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 4209+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 4210+ SIMD_ST_F32(out + index, out_tmp); 4211+ } 4212+ return index; 4213+} 4214+ 4215+static inline int ElementFloorDivAVX512(int index, const float *in0, const float *in1, float *out, int size) { 4216+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4217+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4218+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4219+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 4220+ SIMD_ST_F32(out + index, floor_tmp); 4221+ } 4222+ return index; 4223+} 4224+ 4225+static inline int ElementOptFloorDivNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4226+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 4227+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4228+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4229+ SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 4230+ SIMD_ST_F32(out + index, out_tmp); 4231+ } 4232+ return index; 4233+} 4234+ 4235+static inline int ElementOptFloorDivNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4236+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 4237+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4238+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4239+ SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 4240+ SIMD_ST_F32(out + index, out_tmp); 4241+ } 4242+ return index; 4243+} 4244+#endif 4245+ 4246+static inline int ElementFloorDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 4247+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4248+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 4249+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 4250+ SIMD_EPI32 
out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 4251+ SIMD_ST_EPI32(out + index, out_tmp); 4252+ } 4253+ return index; 4254+} 4255+ 4256+static inline int ElementOptFloorDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 4257+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 4258+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4259+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 4260+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 4261+ SIMD_ST_EPI32(out + index, out_tmp); 4262+ } 4263+ return index; 4264+} 4265+ 4266+static inline int ElementOptFloorDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 4267+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 4268+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4269+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 4270+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 4271+ SIMD_ST_EPI32(out + index, out_tmp); 4272+ } 4273+ return index; 4274+} 4275+ 4276+static inline int ElementMaximumAVX512(int index, const float *in0, const float *in1, float *out, int size) { 4277+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4278+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4279+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4280+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 4281+ SIMD_ST_F32(out + index, out_tmp); 4282+ } 4283+ return index; 4284+} 4285+ 4286+static inline int ElementOptMaximumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4287+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 4288+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4289+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4290+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 4291+ SIMD_ST_F32(out + index, out_tmp); 4292+ } 4293+ return index; 4294+} 4295+ 4296+static 
inline int ElementOptMaximumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4297+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 4298+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4299+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4300+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 4301+ SIMD_ST_F32(out + index, out_tmp); 4302+ } 4303+ return index; 4304+} 4305+ 4306+static inline int ElementMaximumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 4307+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4308+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 4309+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 4310+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 4311+ SIMD_ST_EPI32(out + index, out_tmp); 4312+ } 4313+ return index; 4314+} 4315+ 4316+static inline int ElementOptMaximumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 4317+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 4318+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4319+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 4320+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 4321+ SIMD_ST_EPI32(out + index, out_tmp); 4322+ } 4323+ return index; 4324+} 4325+ 4326+static inline int ElementOptMaximumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 4327+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 4328+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4329+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 4330+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 4331+ SIMD_ST_EPI32(out + index, out_tmp); 4332+ } 4333+ return index; 4334+} 4335+ 4336+static inline int ElementMinimumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 4337+ for (int block_max_size = 
size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4338+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 4339+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 4340+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 4341+ SIMD_ST_EPI32(out + index, out_tmp); 4342+ } 4343+ return index; 4344+} 4345+ 4346+static inline int ElementOptMinimumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 4347+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 4348+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4349+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 4350+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 4351+ SIMD_ST_EPI32(out + index, out_tmp); 4352+ } 4353+ return index; 4354+} 4355+ 4356+static inline int ElementOptMinimumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 4357+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 4358+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4359+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 4360+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 4361+ SIMD_ST_EPI32(out + index, out_tmp); 4362+ } 4363+ return index; 4364+} 4365+ 4366+static inline int ElementMinimumAVX512(int index, const float *in0, const float *in1, float *out, int size) { 4367+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4368+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4369+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4370+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 4371+ SIMD_ST_F32(out + index, out_tmp); 4372+ } 4373+ return index; 4374+} 4375+ 4376+static inline int ElementOptMinimumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4377+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 4378+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4379+ 
SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 4380+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 4381+ SIMD_ST_F32(out + index, out_tmp); 4382+ } 4383+ return index; 4384+} 4385+ 4386+static inline int ElementOptMinimumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 4387+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 4388+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4389+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 4390+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 4391+ SIMD_ST_F32(out + index, out_tmp); 4392+ } 4393+ return index; 4394+} 4395+ 4396+#undef MS_SIMD_INSTRUCTION 4397+#undef BLOCK_NUM 4398+#pragma GCC pop_options 4399+#undef MS_SIMD_AVX512 4400+#ifdef __cplusplus 4401+} 4402+#endif 4403+#endif 4404diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h 4405new file mode 100644 4406index 00000000..c671e327 4407--- /dev/null 4408+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h 4409@@ -0,0 +1,129 @@ 4410+/** 4411+ * Copyright 2022 Huawei Technologies Co., Ltd 4412+ * 4413+ * Licensed under the Apache License, Version 2.0 (the "License"); 4414+ * you may not use this file except in compliance with the License. 4415+ * You may obtain a copy of the License at 4416+ * 4417+ * http://www.apache.org/licenses/LICENSE-2.0 4418+ * 4419+ * Unless required by applicable law or agreed to in writing, software 4420+ * distributed under the License is distributed on an "AS IS" BASIS, 4421+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4422+ * See the License for the specific language governing permissions and 4423+ * limitations under the License. 
4424+ */ 4425+ 4426+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_ 4427+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_ 4428+ 4429+#include "nnacl/intrinsics/ms_simd_instructions.h" 4430+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4431+ 4432+#ifdef __cplusplus 4433+extern "C" { 4434+#endif 4435+#pragma GCC push_options 4436+#pragma GCC target("avx512f") 4437+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4438+#define BLOCK_NUM 16 4439+#define MS_SIMD_AVX512 4440+ 4441+#if defined(MS_SIMD_AVX512) 4442+// only avx512 support abs fp32 instruction 4443+static inline int ElementAbsAVX512(int index, const float *input, float *output, const int element_size) { 4444+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4445+ SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index))); 4446+ } 4447+ return index; 4448+} 4449+ 4450+static inline int ElementAbsIntAVX512(int index, const int *input, int *output, const int element_size) { 4451+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4452+ SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index))); 4453+ } 4454+ return index; 4455+} 4456+#endif 4457+ 4458+static inline int ElementSquareAVX512(int index, const float *input, float *output, const int element_size) { 4459+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4460+ SIMD_F32 vin = SIMD_LD_F32(input + index); 4461+ SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin)); 4462+ } 4463+ return index; 4464+} 4465+ 4466+static inline int ElementSqrtAVX512(int index, const float *input, float *output, const int element_size) { 4467+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4468+ SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index))); 4469+ } 4470+ return index; 4471+} 4472+ 4473+static 
inline int ElementRsqrtAVX512(int index, const float *input, float *output, const int element_size) { 4474+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4475+ SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index))); 4476+ } 4477+ return index; 4478+} 4479+ 4480+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE) 4481+// avx512 dont support round fp32 instruction 4482+static inline int ElementRoundAVX512(int index, const float *input, float *output, const int element_size) { 4483+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4484+ SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index))); 4485+ } 4486+ return index; 4487+} 4488+#endif 4489+ 4490+#ifndef MS_SIMD_NEON 4491+// neon dont support floor fp32 instruction 4492+static inline int ElementFloorAVX512(int index, const float *input, float *output, const int element_size) { 4493+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4494+ SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index))); 4495+ } 4496+ return index; 4497+} 4498+#endif 4499+ 4500+#ifndef MS_SIMD_NEON 4501+static inline int ElementCeilAVX512(int index, const float *input, float *output, const int element_size) { 4502+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4503+ SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index))); 4504+ } 4505+ return index; 4506+} 4507+#endif 4508+ 4509+static inline int ElementNegativeAVX512(int index, const float *input, float *output, const int element_size) { 4510+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4511+ SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f)); 4512+ } 4513+ return index; 4514+} 4515+ 4516+static inline int ElementNegativeIntAVX512(int 
index, const int *input, int *output, const int element_size) { 4517+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4518+ SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1)); 4519+ } 4520+ return index; 4521+} 4522+ 4523+static inline int ElementReciprocalAVX512(int index, const float *input, float *output, const int element_size) { 4524+ SIMD_F32 num1 = SIMD_MOV_F32(1.0f); 4525+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4526+ SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index))); 4527+ } 4528+ return index; 4529+} 4530+ 4531+#undef MS_SIMD_INSTRUCTION 4532+#undef BLOCK_NUM 4533+#pragma GCC pop_options 4534+#undef MS_SIMD_AVX512 4535+#ifdef __cplusplus 4536+} 4537+#endif 4538+#endif 4539diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h 4540new file mode 100644 4541index 00000000..fd945984 4542--- /dev/null 4543+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h 4544@@ -0,0 +1,67 @@ 4545+/** 4546+ * Copyright 2022 Huawei Technologies Co., Ltd 4547+ * 4548+ * Licensed under the Apache License, Version 2.0 (the "License"); 4549+ * you may not use this file except in compliance with the License. 4550+ * You may obtain a copy of the License at 4551+ * 4552+ * http://www.apache.org/licenses/LICENSE-2.0 4553+ * 4554+ * Unless required by applicable law or agreed to in writing, software 4555+ * distributed under the License is distributed on an "AS IS" BASIS, 4556+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4557+ * See the License for the specific language governing permissions and 4558+ * limitations under the License. 
4559+ */ 4560+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_AVX512_H_ 4561+#define MINDSPORE_NNACL_FP32_BATCHNORM_AVX512_H_ 4562+ 4563+#include "nnacl/intrinsics/ms_simd_instructions.h" 4564+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4565+ 4566+#ifdef __cplusplus 4567+extern "C" { 4568+#endif 4569+#pragma GCC push_options 4570+#pragma GCC target("avx512f") 4571+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4572+#define BLOCK_NUM 16 4573+#define MS_SIMD_AVX512 4574+ 4575+static inline int BatchNormFp32AVX512(int index, const float *input, const float *mean, 4576+ const float *variance, int channel, float epsilon, float *output) { 4577+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4578+ SIMD_F32 input_data = SIMD_LD_F32(input + index); 4579+ SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 4580+ SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 4581+ SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 4582+ SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 4583+ SIMD_ST_F32(output + index, output_data); 4584+ } 4585+ return index; 4586+} 4587+ 4588+static inline int FusedBatchNormFp32AVX512(int index, const float *input, const float *scale, 4589+ const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) { 4590+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4591+ SIMD_F32 input_data = SIMD_LD_F32(input + index); 4592+ SIMD_F32 scale_ = SIMD_LD_F32(scale + index); 4593+ SIMD_F32 offset_ = SIMD_LD_F32(offset + index); 4594+ SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 4595+ SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 4596+ SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 4597+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 4598+ SIMD_F32 output_data =
SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_); 4599+ SIMD_ST_F32(output + index, output_data); 4600+ } 4601+ return index; 4602+} 4603+ 4604+#undef MS_SIMD_INSTRUCTION 4605+#undef BLOCK_NUM 4606+#pragma GCC pop_options 4607+#undef MS_SIMD_AVX512 4608+#ifdef __cplusplus 4609+} 4610+#endif 4611+#endif 4612diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h 4613new file mode 100644 4614index 00000000..f5353f61 4615--- /dev/null 4616+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h 4617@@ -0,0 +1,69 @@ 4618+/** 4619+ * Copyright 2022 Huawei Technologies Co., Ltd 4620+ * 4621+ * Licensed under the Apache License, Version 2.0 (the "License"); 4622+ * you may not use this file except in compliance with the License. 4623+ * You may obtain a copy of the License at 4624+ * 4625+ * http://www.apache.org/licenses/LICENSE-2.0 4626+ * 4627+ * Unless required by applicable law or agreed to in writing, software 4628+ * distributed under the License is distributed on an "AS IS" BASIS, 4629+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4630+ * See the License for the specific language governing permissions and 4631+ * limitations under the License. 
4632+ */ 4633+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_ 4634+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_ 4635+ 4636+#include "nnacl/intrinsics/ms_simd_instructions.h" 4637+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4638+ 4639+#ifdef __cplusplus 4640+extern "C" { 4641+#endif 4642+#pragma GCC push_options 4643+#pragma GCC target("avx512f") 4644+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4645+#define BLOCK_NUM 16 4646+#define MS_SIMD_AVX512 4647+ 4648+static inline int BCEWithLogitLossAVX512(int index, const float *logits, const float *label, 4649+ const float *weight, const float *pos_weight, int length, bool reduction, float *output, 4650+ float *reduction_sum) { 4651+ SIMD_F32 zero = SIMD_SET0_F32; 4652+ SIMD_F32 ones = SIMD_MOV_F32(1.0f); 4653+ SIMD_F32 middle_output = SIMD_SET0_F32; 4654+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4655+ SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); 4656+ SIMD_F32 label_tmp = SIMD_LD_F32(label + index); 4657+ SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); 4658+ SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); 4659+ SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); 4660+ SIMD_F32 max_value = neg_logits_tmp; 4661+ max_value = SIMD_MIN_F32(max_value, zero); 4662+ SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); 4663+ SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); 4664+ SIMD_F32 log_exp_value = 4665+ SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); 4666+ SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), 4667+ SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); 4668+ if (reduction) { 4669+ middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); 4670+ } else { 4671+ SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, 
weight_tmp)); 4672+ } 4673+ } 4674+ if (reduction) { 4675+ *reduction_sum += SIMD_GET_SUM_F32(middle_output); 4676+ } 4677+ return index; 4678+} 4679+#undef MS_SIMD_INSTRUCTION 4680+#undef BLOCK_NUM 4681+#pragma GCC pop_options 4682+#undef MS_SIMD_AVX512 4683+#ifdef __cplusplus 4684+} 4685+#endif 4686+#endif 4687diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h 4688new file mode 100644 4689index 00000000..abdad5ff 4690--- /dev/null 4691+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h 4692@@ -0,0 +1,64 @@ 4693+/** 4694+ * Copyright 2022 Huawei Technologies Co., Ltd 4695+ * 4696+ * Licensed under the Apache License, Version 2.0 (the "License"); 4697+ * you may not use this file except in compliance with the License. 4698+ * You may obtain a copy of the License at 4699+ * 4700+ * http://www.apache.org/licenses/LICENSE-2.0 4701+ * 4702+ * Unless required by applicable law or agreed to in writing, software 4703+ * distributed under the License is distributed on an "AS IS" BASIS, 4704+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4705+ * See the License for the specific language governing permissions and 4706+ * limitations under the License. 
4707+ */ 4708+ 4709+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_ 4710+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_ 4711+ 4712+#include "nnacl/intrinsics/ms_simd_instructions.h" 4713+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4714+ 4715+#ifdef __cplusplus 4716+extern "C" { 4717+#endif 4718+#pragma GCC push_options 4719+#pragma GCC target("avx512f") 4720+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4721+#define BLOCK_NUM 16 4722+#define MS_SIMD_AVX512 4723+ 4724+static inline int BiasAddByInnerCoreAVX512(int index, const float *input, const float *bias, float *output, 4725+ int64_t num) { 4726+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4727+ SIMD_F32 vin0 = SIMD_LD_F32(input + index); 4728+ SIMD_F32 vin1 = SIMD_LD_F32(bias + index); 4729+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 4730+ SIMD_ST_F32(output + index, vout); 4731+ } 4732+ return index; 4733+} 4734+ 4735+static inline int BiasAddByBatchCoreAVX512(int index, const float *input, const float *bias, float *output1, 4736+ float *output2, float *output3, float *output4, int64_t num) { 4737+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4738+ SIMD_LDX4_F32(input_data, input + index, num); 4739+ SIMD_F32 bias_data = SIMD_LD_F32(bias + index); 4740+ SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data)); 4741+ SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data)); 4742+ SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data)); 4743+ SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data)); 4744+ } 4745+ return index; 4746+} 4747+ 4748+#undef MS_SIMD_INSTRUCTION 4749+#undef BLOCK_NUM 4750+#pragma GCC pop_options 4751+#undef MS_SIMD_AVX512 4752+#ifdef __cplusplus 4753+} 4754+#endif 4755+ 4756+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_ 4757diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h 4758new file mode 100644 4759index 00000000..91d52718 4760--- /dev/null 4761+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h 4762@@ -0,0 +1,56 @@ 4763+/** 4764+ * Copyright 2022 Huawei Technologies Co., Ltd 4765+ * 4766+ * Licensed under the Apache License, Version 2.0 (the "License"); 4767+ * you may not use this file except in compliance with the License. 4768+ * You may obtain a copy of the License at 4769+ * 4770+ * http://www.apache.org/licenses/LICENSE-2.0 4771+ * 4772+ * Unless required by applicable law or agreed to in writing, software 4773+ * distributed under the License is distributed on an "AS IS" BASIS, 4774+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4775+ * See the License for the specific language governing permissions and 4776+ * limitations under the License. 4777+ */ 4778+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_ 4779+#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_ 4780+ 4781+#include "nnacl/intrinsics/ms_simd_instructions.h" 4782+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4783+ 4784+#ifdef __cplusplus 4785+extern "C" { 4786+#endif 4787+#pragma GCC push_options 4788+#pragma GCC target("avx512f") 4789+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4790+#define BLOCK_NUM 16 4791+#define MS_SIMD_AVX512 4792+ 4793+static inline int Int32ToFloat32AVX512(int index, const int32_t *input, float *output, int number) { 4794+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4795+ SIMD_EPI32 value = SIMD_LD_EPI32(input + index); 4796+ SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); 4797+ } 4798+ return index; 4799+} 4800+ 4801+#ifndef MS_SIMD_NEON 4802+static inline int Float32ToInt32AVX512(int index, const float *input, int32_t *output, int number) { 4803+ for (int block_max_size = number - BLOCK_NUM + 1; index 
< block_max_size; index += BLOCK_NUM) { 4804+ SIMD_F32 value = SIMD_LD_F32(input + index); 4805+ SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); 4806+ } 4807+ return index; 4808+} 4809+#endif 4810+ 4811+#undef MS_SIMD_INSTRUCTION 4812+#undef BLOCK_NUM 4813+#pragma GCC pop_options 4814+#undef MS_SIMD_AVX512 4815+#ifdef __cplusplus 4816+} 4817+#endif 4818+#endif 4819diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h 4820new file mode 100644 4821index 00000000..11a2abcf 4822--- /dev/null 4823+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h 4824@@ -0,0 +1,70 @@ 4825+/** 4826+ * Copyright 2022 Huawei Technologies Co., Ltd 4827+ * 4828+ * Licensed under the Apache License, Version 2.0 (the "License"); 4829+ * you may not use this file except in compliance with the License. 4830+ * You may obtain a copy of the License at 4831+ * 4832+ * http://www.apache.org/licenses/LICENSE-2.0 4833+ * 4834+ * Unless required by applicable law or agreed to in writing, software 4835+ * distributed under the License is distributed on an "AS IS" BASIS, 4836+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4837+ * See the License for the specific language governing permissions and 4838+ * limitations under the License. 
4839+ */ 4840+#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX512_H_ 4841+#define MINDSPORE_NNACL_FP32_CDIST_AVX512_H_ 4842+ 4843+#include "nnacl/intrinsics/ms_simd_instructions.h" 4844+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4845+ 4846+#ifdef __cplusplus 4847+extern "C" { 4848+#endif 4849+#pragma GCC push_options 4850+#pragma GCC target("avx512f") 4851+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4852+#define BLOCK_NUM 16 4853+#define MS_SIMD_AVX512 4854+ 4855+static inline int64_t CdistTwoNormalOptAVX512(int64_t index, const float *a, const float *b, 4856+ float *out, int64_t size) { 4857+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 4858+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4859+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 4860+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 4861+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 4862+ tmp_vec = SIMD_ABS_F32(tmp_vec); 4863+ result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); 4864+ } 4865+ *out += SIMD_GET_SUM_F32(result_vec); 4866+ 4867+ return index; 4868+} 4869+ 4870+static inline int64_t CdistPNormalOptAVX512(int64_t index, const float *a, const float *b, 4871+ float *out, int64_t size, float p) { 4872+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 4873+ SIMD_F32 p_vec = SIMD_MOV_F32(p); 4874+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4875+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 4876+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 4877+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 4878+ tmp_vec = SIMD_ABS_F32(tmp_vec); 4879+ tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); 4880+ result_vec = SIMD_ADD_F32(tmp_vec, result_vec); 4881+ } 4882+ *out += SIMD_GET_SUM_F32(result_vec); 4883+ 4884+ return index; 4885+} 4886+ 4887+#undef MS_SIMD_INSTRUCTION 4888+#undef BLOCK_NUM 4889+#pragma GCC pop_options 4890+#undef MS_SIMD_AVX512 4891+#ifdef __cplusplus 4892+} 4893+#endif 4894+#endif 4895diff --git 
a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h 4896new file mode 100644 4897index 00000000..f82adabf 4898--- /dev/null 4899+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h 4900@@ -0,0 +1,121 @@ 4901+/** 4902+ * Copyright 2022 Huawei Technologies Co., Ltd 4903+ * 4904+ * Licensed under the Apache License, Version 2.0 (the "License"); 4905+ * you may not use this file except in compliance with the License. 4906+ * You may obtain a copy of the License at 4907+ * 4908+ * http://www.apache.org/licenses/LICENSE-2.0 4909+ * 4910+ * Unless required by applicable law or agreed to in writing, software 4911+ * distributed under the License is distributed on an "AS IS" BASIS, 4912+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 4913+ * See the License for the specific language governing permissions and 4914+ * limitations under the License. 
4915+ */ 4916+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_ 4917+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_ 4918+ 4919+#include "nnacl/intrinsics/ms_simd_instructions.h" 4920+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 4921+ 4922+#ifdef __cplusplus 4923+extern "C" { 4924+#endif 4925+#pragma GCC push_options 4926+#pragma GCC target("avx512f") 4927+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 4928+#define BLOCK_NUM 16 4929+#define MS_SIMD_AVX512 4930+ 4931+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 4932+// (a, b, c) -> (0, a, a+b) exclusive == true 4933+static inline int64_t CumsumOutputInitWithInputAVX512(int64_t index, const float *layer_input, 4934+ float *layer_output, int inner_dim) { 4935+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4936+ SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index)); 4937+ } 4938+ return index; 4939+} 4940+ 4941+static inline int64_t CumsumOutputInitWithZeroAVX512(int64_t index, float *layer_output, int inner_dim) { 4942+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4943+ SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f)); 4944+ } 4945+ return index; 4946+} 4947+ 4948+static inline int64_t CumsumAVX512(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output, 4949+ int inner_dim) { 4950+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4951+ SIMD_F32 input_val = SIMD_LD_F32(layer_input + index); 4952+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index); 4953+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 4954+ SIMD_ST_F32(layer_output + index, out_val); 4955+ } 4956+ return index; 4957+} 4958+ 4959+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 4960+// (a, b, c) -> (c+b, c, 0) exclusive==true 4961+static inline int64_t CumsumReverseAVX512(int64_t index, const float 
*layer_input, float *layer_output, 4962+ float *layer_last_output, int inner_dim) { 4963+ 4964+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4965+ SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1); 4966+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1); 4967+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 4968+ SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val); 4969+ } 4970+ return index; 4971+} 4972+ 4973+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 4974+// (a, b, c) -> (0, a, a+b) exclusive == true 4975+static inline int64_t CumsumIntOutputInitWithInputAVX512(int64_t index, const int *layer_input, 4976+ int *layer_output, int inner_dim) { 4977+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4978+ SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index)); 4979+ } 4980+ return index; 4981+} 4982+ 4983+static inline int64_t CumsumIntOutputInitWithZeroAVX512(int64_t index, int *layer_output, int inner_dim) { 4984+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4985+ SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0)); 4986+ } 4987+ return index; 4988+} 4989+ 4990+static inline int64_t CumsumIntAVX512(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 4991+ int inner_dim) { 4992+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 4993+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index); 4994+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index); 4995+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 4996+ SIMD_ST_EPI32(layer_output + index, out_val); 4997+ } 4998+ return index; 4999+} 5000+ 5001+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 5002+// (a, b, c) -> (c+b, c, 0) exclusive==true
5003+static inline int64_t CumsumReverseIntAVX512(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 5004+ int inner_dim) { 5005+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5006+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); 5007+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); 5008+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 5009+ SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); 5010+ } 5011+ return index; 5012+} 5013+ 5014+#undef MS_SIMD_INSTRUCTION 5015+#undef BLOCK_NUM 5016+#pragma GCC pop_options 5017+#undef MS_SIMD_AVX512 5018+#ifdef __cplusplus 5019+} 5020+#endif 5021+#endif 5022diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h 5023new file mode 100644 5024index 00000000..4de588fb 5025--- /dev/null 5026+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h 5027@@ -0,0 +1,167 @@ 5028+/** 5029+ * Copyright 2022 Huawei Technologies Co., Ltd 5030+ * 5031+ * Licensed under the Apache License, Version 2.0 (the "License"); 5032+ * you may not use this file except in compliance with the License. 5033+ * You may obtain a copy of the License at 5034+ * 5035+ * http://www.apache.org/licenses/LICENSE-2.0 5036+ * 5037+ * Unless required by applicable law or agreed to in writing, software 5038+ * distributed under the License is distributed on an "AS IS" BASIS, 5039+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5040+ * See the License for the specific language governing permissions and 5041+ * limitations under the License. 
5042+ */ 5043+ 5044+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX512_H_ 5045+#define MINDSPORE_NNACL_FP32_DIV_AVX512_H_ 5046+ 5047+#include "nnacl/intrinsics/ms_simd_instructions.h" 5048+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5049+ 5050+#ifdef __cplusplus 5051+extern "C" { 5052+#endif 5053+#pragma GCC push_options 5054+#pragma GCC target("avx512f") 5055+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5056+#define BLOCK_NUM 16 5057+#define MS_SIMD_AVX512 5058+ 5059+static inline int ElementOptDivNum0AVX512(int index, const float *in0, const float *in1, float *out, 5060+ int size) { 5061+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 5062+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5063+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5064+ SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); 5065+ SIMD_ST_F32(out + index, vout); 5066+ } 5067+ return index; 5068+} 5069+ 5070+static inline int ElementOptDivNum1AVX512(int index, const float *in0, const float *in1, float *out, 5071+ int size) { 5072+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 5073+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5074+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5075+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); 5076+ SIMD_ST_F32(out + index, vout); 5077+ } 5078+ return index; 5079+} 5080+ 5081+static inline int ElementOptDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5082+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 5083+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5084+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5085+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); 5086+ SIMD_ST_EPI32(out + index, vout); 5087+ } 5088+ return index; 5089+} 5090+ 5091+static inline int ElementOptDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5092+ SIMD_EPI32 vin1_opt_ = 
SIMD_MOV_EPI32(in1[0]); 5093+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5094+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5095+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); 5096+ SIMD_ST_EPI32(out + index, vout); 5097+ } 5098+ return index; 5099+} 5100+ 5101+static inline int ElementOptDivReluNum0AVX512(int index, const float *in0, const float *in1, float *out, 5102+ int size) { 5103+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 5104+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5105+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5106+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); 5107+ SIMD_ST_F32(out + index, vout); 5108+ } 5109+ return index; 5110+} 5111+ 5112+static inline int ElementOptDivReluNum1AVX512(int index, const float *in0, const float *in1, float *out, 5113+ int size) { 5114+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 5115+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5116+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5117+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); 5118+ SIMD_ST_F32(out + index, vout); 5119+ } 5120+ return index; 5121+} 5122+ 5123+static inline int ElementOptDivRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, 5124+ int size) { 5125+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 5126+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5127+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5128+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); 5129+ SIMD_ST_F32(out + index, vout); 5130+ } 5131+ return index; 5132+} 5133+ 5134+static inline int ElementOptDivRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, 5135+ int size) { 5136+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 5137+ for (int block_max_size = size - BLOCK_NUM + 
1; index < block_max_size; index += BLOCK_NUM) { 5138+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5139+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); 5140+ SIMD_ST_F32(out + index, vout); 5141+ } 5142+ return index; 5143+} 5144+ 5145+static inline int ElementDivAVX512(int index, const float *in0, const float *in1, float *out, int size) { 5146+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5147+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5148+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5149+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); 5150+ SIMD_ST_F32(out + index, vout); 5151+ } 5152+ return index; 5153+} 5154+ 5155+static inline int ElementDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 5156+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5157+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5158+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5159+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); 5160+ SIMD_ST_EPI32(out + index, vout); 5161+ } 5162+ return index; 5163+} 5164+ 5165+static inline int ElementDivReluAVX512(int index, const float *in0, const float *in1, float *out, 5166+ int size) { 5167+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5168+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5169+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5170+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); 5171+ SIMD_ST_F32(out + index, vout); 5172+ } 5173+ return index; 5174+} 5175+ 5176+static inline int ElementDivRelu6AVX512(int index, const float *in0, const float *in1, float *out, 5177+ int size) { 5178+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5179+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5180+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5181+ SIMD_F32 vout = 
SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f); 5182+ SIMD_ST_F32(out + index, vout); 5183+ } 5184+ return index; 5185+} 5186+ 5187+#undef MS_SIMD_INSTRUCTION 5188+#undef BLOCK_NUM 5189+#pragma GCC pop_options 5190+#undef MS_SIMD_AVX512 5191+#ifdef __cplusplus 5192+}; 5193+#endif 5194+#endif 5195diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h 5196new file mode 100644 5197index 00000000..eb847c23 5198--- /dev/null 5199+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h 5200@@ -0,0 +1,46 @@ 5201+/** 5202+ * Copyright 2022 Huawei Technologies Co., Ltd 5203+ * 5204+ * Licensed under the Apache License, Version 2.0 (the "License"); 5205+ * you may not use this file except in compliance with the License. 5206+ * You may obtain a copy of the License at 5207+ * 5208+ * http://www.apache.org/licenses/LICENSE-2.0 5209+ * 5210+ * Unless required by applicable law or agreed to in writing, software 5211+ * distributed under the License is distributed on an "AS IS" BASIS, 5212+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5213+ * See the License for the specific language governing permissions and 5214+ * limitations under the License. 
5215+ */ 5216+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_ 5217+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_ 5218+ 5219+#include "nnacl/intrinsics/ms_simd_instructions.h" 5220+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5221+ 5222+#ifdef __cplusplus 5223+extern "C" { 5224+#endif 5225+#pragma GCC push_options 5226+#pragma GCC target("avx512f") 5227+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5228+#define BLOCK_NUM 16 5229+#define MS_SIMD_AVX512 5230+ 5231+static inline int DropoutFp32AVX512(int index, const float *input, float scale, 5232+ int length, float *output) { 5233+ SIMD_F32 scale_value = SIMD_MOV_F32(scale); 5234+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5235+ SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); 5236+ } 5237+ return index; 5238+} 5239+#undef MS_SIMD_INSTRUCTION 5240+#undef BLOCK_NUM 5241+#pragma GCC pop_options 5242+#undef MS_SIMD_AVX512 5243+#ifdef __cplusplus 5244+} 5245+#endif 5246+#endif 5247diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h 5248new file mode 100644 5249index 00000000..14386f5f 5250--- /dev/null 5251+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h 5252@@ -0,0 +1,63 @@ 5253+/** 5254+ * Copyright 2022 Huawei Technologies Co., Ltd 5255+ * 5256+ * Licensed under the Apache License, Version 2.0 (the "License"); 5257+ * you may not use this file except in compliance with the License. 5258+ * You may obtain a copy of the License at 5259+ * 5260+ * http://www.apache.org/licenses/LICENSE-2.0 5261+ * 5262+ * Unless required by applicable law or agreed to in writing, software 5263+ * distributed under the License is distributed on an "AS IS" BASIS, 5264+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
5265+ * See the License for the specific language governing permissions and 5266+ * limitations under the License. 5267+ */ 5268+ 5269+#ifndef MINDSPORE_NNACL_FP32_EXP_FP32_AVX512_H_ 5270+#define MINDSPORE_NNACL_FP32_EXP_FP32_AVX512_H_ 5271+ 5272+#include "nnacl/intrinsics/ms_simd_instructions.h" 5273+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5274+ 5275+#ifdef __cplusplus 5276+extern "C" { 5277+#endif 5278+#pragma GCC push_options 5279+#pragma GCC target("avx512f") 5280+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5281+#define BLOCK_NUM 16 5282+#define MS_SIMD_AVX512 5283+ 5284+static inline int64_t ExpFp32AVX512(int64_t index, const float *src, float *dst, int num) { 5285+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5286+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 5287+ } 5288+ return index; 5289+} 5290+ 5291+static inline int64_t ExpFp32WithInScaleAVX512(int64_t index, const float *src, float *dst, int num, float in_scale) { 5292+ SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale); 5293+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5294+ SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index); 5295+ } 5296+ return index; 5297+} 5298+ 5299+static inline int64_t ExpFp32WithOutScaleAVX512(int64_t index, const float *src, float *dst, int num, float out_scale) { 5300+ SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale); 5301+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5302+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 5303+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec)); 5304+ } 5305+ return index; 5306+} 5307+ 5308+#undef MS_SIMD_INSTRUCTION 5309+#undef BLOCK_NUM 5310+#pragma GCC pop_options 5311+#undef MS_SIMD_AVX512 5312+#ifdef __cplusplus 5313+}; 5314+#endif 5315+#endif 5316diff --git 
a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h 5317new file mode 100644 5318index 00000000..5eb04746 5319--- /dev/null 5320+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h 5321@@ -0,0 +1,53 @@ 5322+/** 5323+ * Copyright 2022 Huawei Technologies Co., Ltd 5324+ * 5325+ * Licensed under the Apache License, Version 2.0 (the "License"); 5326+ * you may not use this file except in compliance with the License. 5327+ * You may obtain a copy of the License at 5328+ * 5329+ * http://www.apache.org/licenses/LICENSE-2.0 5330+ * 5331+ * Unless required by applicable law or agreed to in writing, software 5332+ * distributed under the License is distributed on an "AS IS" BASIS, 5333+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5334+ * See the License for the specific language governing permissions and 5335+ * limitations under the License. 
5336+ */ 5337+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_ 5338+#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_ 5339+ 5340+#include "nnacl/intrinsics/ms_simd_instructions.h" 5341+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5342+ 5343+#ifdef __cplusplus 5344+extern "C" { 5345+#endif 5346+#pragma GCC push_options 5347+#pragma GCC target("avx512f") 5348+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5349+#define BLOCK_NUM 16 5350+#define MS_SIMD_AVX512 5351+ 5352+static inline int FillFp32AVX512(int index, float *output, int size, float data) { 5353+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5354+ SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); 5355+ } 5356+ return index; 5357+} 5358+ 5359+static inline int FillInt32AVX512(int index, int *output, int size, int data) { 5360+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5361+ SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); 5362+ } 5363+ return index; 5364+} 5365+ 5366+#undef MS_SIMD_INSTRUCTION 5367+#undef BLOCK_NUM 5368+#pragma GCC pop_options 5369+#undef MS_SIMD_AVX512 5370+#ifdef __cplusplus 5371+} 5372+#endif 5373+#endif 5374+ 5375diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h 5376new file mode 100644 5377index 00000000..f26537d9 5378--- /dev/null 5379+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h 5380@@ -0,0 +1,77 @@ 5381+/** 5382+ * Copyright 2022 Huawei Technologies Co., Ltd 5383+ * 5384+ * Licensed under the Apache License, Version 2.0 (the "License"); 5385+ * you may not use this file except in compliance with the License. 
5386+ * You may obtain a copy of the License at 5387+ * 5388+ * http://www.apache.org/licenses/LICENSE-2.0 5389+ * 5390+ * Unless required by applicable law or agreed to in writing, software 5391+ * distributed under the License is distributed on an "AS IS" BASIS, 5392+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5393+ * See the License for the specific language governing permissions and 5394+ * limitations under the License. 5395+ */ 5396+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_ 5397+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_ 5398+ 5399+#include "nnacl/intrinsics/ms_simd_instructions.h" 5400+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5401+ 5402+#ifdef __cplusplus 5403+extern "C" { 5404+#endif 5405+#pragma GCC push_options 5406+#pragma GCC target("avx512f") 5407+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5408+#define BLOCK_NUM 16 5409+#define MS_SIMD_AVX512 5410+ 5411+static inline int64_t GroupNormFp32AVX512(int64_t index, const float *unit_input, float scale, float offset, float mean, 5412+ float var_sqrt, int unit, float *unit_output) { 5413+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 5414+ SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); 5415+ SIMD_F32 scale_val = SIMD_MOV_F32(scale); 5416+ SIMD_F32 offset_val = SIMD_MOV_F32(offset); 5417+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5418+ SIMD_F32 input = SIMD_LD_F32(unit_input + index); 5419+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); 5420+ SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); 5421+ SIMD_ST_F32(unit_output + index, output); 5422+ } 5423+ return index; 5424+} 5425+ 5426+static inline int64_t GroupNormReduceSumAVX512(int64_t index, const float *in, float *sum, int unit) { 5427+ if (unit - index >= 4 * BLOCK_NUM) { 5428+ SIMD_F32 tmp = SIMD_MOV_F32(0); 5429+ for (int block_max_size = unit - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { 5430+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); 5431+ } 5432+ *sum += SIMD_GET_SUM_F32(tmp); 5433+ } 5434+ return index; 5435+} 5436+ 5437+static inline int64_t GroupNormReduceVarAVX512(int64_t index, const float *in, float mean, float *sum, int unit) { 5438+ if (unit - index >= 4 * BLOCK_NUM) { 5439+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 5440+ SIMD_F32 tmp = SIMD_MOV_F32(0); 5441+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5442+ SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); 5443+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); 5444+ } 5445+ *sum += SIMD_GET_SUM_F32(tmp); 5446+ } 5447+ return index; 5448+} 5449+ 5450+#undef MS_SIMD_INSTRUCTION 5451+#undef BLOCK_NUM 5452+#pragma GCC pop_options 5453+#undef MS_SIMD_AVX512 5454+#ifdef __cplusplus 5455+} 5456+#endif 5457+#endif 5458diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h 5459new file mode 100644 5460index 00000000..e5fb6d7b 5461--- /dev/null 5462+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h 5463@@ -0,0 +1,68 @@ 5464+/** 5465+ * Copyright 2022 Huawei Technologies Co., Ltd 5466+ * 5467+ * Licensed under the Apache License, Version 2.0 (the "License"); 5468+ * you may not use this file except in compliance with the License. 5469+ * You may obtain a copy of the License at 5470+ * 5471+ * http://www.apache.org/licenses/LICENSE-2.0 5472+ * 5473+ * Unless required by applicable law or agreed to in writing, software 5474+ * distributed under the License is distributed on an "AS IS" BASIS, 5475+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5476+ * See the License for the specific language governing permissions and 5477+ * limitations under the License. 
5478+ */ 5479+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_ 5480+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_ 5481+ 5482+#include "nnacl/intrinsics/ms_simd_instructions.h" 5483+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5484+ 5485+#ifdef __cplusplus 5486+extern "C" { 5487+#endif 5488+#pragma GCC push_options 5489+#pragma GCC target("avx512f") 5490+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5491+#define BLOCK_NUM 16 5492+#define MS_SIMD_AVX512 5493+ 5494+static inline int LayerNormMeanAndSquareAVX512(int index, const float *src, int num, float *mean, float *square_mean) { 5495+ if (num >= 4 * BLOCK_NUM) { 5496+ SIMD_F32 sum_val = SIMD_SET0_F32; 5497+ SIMD_F32 square_sum_val = SIMD_SET0_F32; 5498+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5499+ SIMD_F32 value = SIMD_LD_F32(src + index); 5500+ SIMD_F32 square_value = SIMD_MUL_F32(value, value); 5501+ sum_val = SIMD_ADD_F32(sum_val, value); 5502+ square_sum_val = SIMD_ADD_F32(square_sum_val, square_value); 5503+ } 5504+ *mean += SIMD_GET_SUM_F32(sum_val); 5505+ *square_mean += SIMD_GET_SUM_F32(square_sum_val); 5506+ } 5507+ return index; 5508+} 5509+ 5510+static inline int LayerNormGammaAndBetaAVX512(int index, float *dst, const float *src, const float *gamma_data, 5511+ const float *beta_data, int num, const float mean, const float deno) { 5512+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 5513+ SIMD_F32 deno_val = SIMD_MOV_F32(deno); 5514+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5515+ SIMD_F32 value = SIMD_LD_F32(src + index); 5516+ SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val); 5517+ out_value = SIMD_MUL_F32(out_value, deno_val); 5518+ out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index)); 5519+ SIMD_ST_F32(dst + index, out_value); 5520+ } 5521+ return index; 5522+} 5523+ 5524+#undef MS_SIMD_INSTRUCTION 5525+#undef 
BLOCK_NUM 5526+#pragma GCC pop_options 5527+#undef MS_SIMD_AVX512 5528+#ifdef __cplusplus 5529+} 5530+#endif 5531+#endif 5532diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h 5533new file mode 100644 5534index 00000000..d51779d4 5535--- /dev/null 5536+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h 5537@@ -0,0 +1,93 @@ 5538+/** 5539+ * Copyright 2022 Huawei Technologies Co., Ltd 5540+ * 5541+ * Licensed under the Apache License, Version 2.0 (the "License"); 5542+ * you may not use this file except in compliance with the License. 5543+ * You may obtain a copy of the License at 5544+ * 5545+ * http://www.apache.org/licenses/LICENSE-2.0 5546+ * 5547+ * Unless required by applicable law or agreed to in writing, software 5548+ * distributed under the License is distributed on an "AS IS" BASIS, 5549+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5550+ * See the License for the specific language governing permissions and 5551+ * limitations under the License. 5552+ */ 5553+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_ 5554+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_ 5555+ 5556+#include "nnacl/intrinsics/ms_simd_instructions.h" 5557+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5558+ 5559+#ifdef __cplusplus 5560+extern "C" { 5561+#endif 5562+#pragma GCC push_options 5563+#pragma GCC target("avx512f") 5564+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5565+#define BLOCK_NUM 16 5566+#define MS_SIMD_AVX512 5567+ 5568+// act_type must be 0, 1, 3. 0: no_act, 1: relu, 3: relu6. 
5569+static inline int64_t GemmIsNotPackAVX512(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, 5570+ int deep, int act_type) { 5571+ SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); 5572+ SIMD_F32 up_threshold = SIMD_MOV_F32(6); 5573+ SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); 5574+ SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); 5575+ for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5576+ SIMD_F32 a_data = SIMD_LD_F32(a + index); 5577+ SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); 5578+ if (act_type != 0) { 5579+ dst = SIMD_MAX_F32(dst, down_threshold); 5580+ if (act_type == 3) { 5581+ dst = SIMD_MIN_F32(dst, up_threshold); 5582+ } 5583+ } 5584+ SIMD_ST_F32(c + index, dst); 5585+ } 5586+ 5587+ return index; 5588+} 5589+ 5590+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) 5591+static inline int64_t GemmIsNotPackOptimizeCoreAVX512(int64_t index, const float *a, const float *b, int k, float *dst) { 5592+ SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); 5593+ for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5594+ SIMD_F32 weight = SIMD_LD_F32(b + index); 5595+ SIMD_F32 a1 = SIMD_LD_F32(a + index); 5596+ dst1 = SIMD_FMADD_F32(weight, a1, dst1); 5597+ } 5598+ *dst += SIMD_REDUCE_ADD_F32(dst1); 5599+ return index; 5600+} 5601+#endif 5602+ 5603+static inline int64_t MatVecMulNoPackCoreAVX512(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, 5604+ int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { 5605+ for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { 5606+ SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? 
SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); 5607+ for (int64_t k = 0; k < depth; ++k) { 5608+ SIMD_F32 left = SIMD_MOV_F32(a[k]); 5609+ SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); 5610+ out = SIMD_FMADD_F32(left, right, out); 5611+ } 5612+ if ((inc_flag & 0x2) != 0 && act_type != 0) { 5613+ out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); 5614+ if (act_type == 0x3) { 5615+ out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); 5616+ } 5617+ } 5618+ SIMD_ST_F32(c + oc_index, out); 5619+ } 5620+ return oc_index; 5621+} 5622+ 5623+#undef MS_SIMD_INSTRUCTION 5624+#undef BLOCK_NUM 5625+#pragma GCC pop_options 5626+#undef MS_SIMD_AVX512 5627+#ifdef __cplusplus 5628+} 5629+#endif 5630+#endif 5631diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h 5632new file mode 100644 5633index 00000000..e3b242e4 5634--- /dev/null 5635+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h 5636@@ -0,0 +1,218 @@ 5637+/** 5638+ * Copyright 2022 Huawei Technologies Co., Ltd 5639+ * 5640+ * Licensed under the Apache License, Version 2.0 (the "License"); 5641+ * you may not use this file except in compliance with the License. 5642+ * You may obtain a copy of the License at 5643+ * 5644+ * http://www.apache.org/licenses/LICENSE-2.0 5645+ * 5646+ * Unless required by applicable law or agreed to in writing, software 5647+ * distributed under the License is distributed on an "AS IS" BASIS, 5648+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5649+ * See the License for the specific language governing permissions and 5650+ * limitations under the License. 
5651+ */ 5652+#ifndef MINDSPORE_NNACL_FP32_MUL_FP32_AVX512_H_ 5653+#define MINDSPORE_NNACL_FP32_MUL_FP32_AVX512_H_ 5654+ 5655+#include "nnacl/intrinsics/ms_simd_instructions.h" 5656+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5657+ 5658+#ifdef __cplusplus 5659+extern "C" { 5660+#endif 5661+#pragma GCC push_options 5662+#pragma GCC target("avx512f") 5663+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5664+#define BLOCK_NUM 16 5665+#define MS_SIMD_AVX512 5666+ 5667+static inline int ElementMulAVX512(int index, const float *in0, const float *in1, float *out, int size) { 5668+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5669+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5670+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5671+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1); 5672+ SIMD_ST_F32(out + index, vout); 5673+ } 5674+ return index; 5675+} 5676+ 5677+static inline int ElementMulReluAVX512(int index, const float *in0, const float *in1, float *out, int size) { 5678+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5679+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5680+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5681+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f); 5682+ SIMD_ST_F32(out + index, vout); 5683+ } 5684+ return index; 5685+} 5686+ 5687+static inline int ElementMulRelu6AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5688+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5689+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5690+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5691+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f); 5692+ SIMD_ST_F32(out + index, vout); 5693+ } 5694+ return index; 5695+} 5696+ 5697+static inline int ElementMulIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 5698+ for (int 
block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5699+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5700+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5701+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1); 5702+ SIMD_ST_EPI32(out + index, vout); 5703+ } 5704+ return index; 5705+} 5706+ 5707+static inline int ElementMulReluIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 5708+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5709+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5710+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5711+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f); 5712+ SIMD_ST_EPI32(out + index, vout); 5713+ } 5714+ return index; 5715+} 5716+ 5717+static inline int ElementMulRelu6IntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 5718+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5719+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5720+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5721+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f); 5722+ SIMD_ST_EPI32(out + index, vout); 5723+ } 5724+ return index; 5725+} 5726+ 5727+static inline int ElementOptMulNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5728+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 5729+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5730+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5731+ SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1); 5732+ SIMD_ST_F32(out + index, vout); 5733+ } 5734+ return index; 5735+} 5736+ 5737+static inline int ElementOptMulNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5738+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 5739+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += 
BLOCK_NUM) { 5740+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5741+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_); 5742+ SIMD_ST_F32(out + index, vout); 5743+ } 5744+ return index; 5745+} 5746+ 5747+static inline int ElementOptMulReluNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5748+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 5749+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5750+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5751+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f); 5752+ SIMD_ST_F32(out + index, vout); 5753+ } 5754+ return index; 5755+} 5756+ 5757+static inline int ElementOptMulReluNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5758+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 5759+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5760+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5761+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f); 5762+ SIMD_ST_F32(out + index, vout); 5763+ } 5764+ return index; 5765+} 5766+ 5767+static inline int ElementOptMulRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5768+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 5769+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5770+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 5771+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f); 5772+ SIMD_ST_F32(out + index, vout); 5773+ } 5774+ return index; 5775+} 5776+ 5777+static inline int ElementOptMulRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, int size) { 5778+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 5779+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5780+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 5781+ SIMD_F32 vout = 
SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f); 5782+ SIMD_ST_F32(out + index, vout); 5783+ } 5784+ return index; 5785+} 5786+ 5787+static inline int ElementOptMulIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5788+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 5789+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5790+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5791+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1); 5792+ SIMD_ST_EPI32(out + index, vout); 5793+ } 5794+ return index; 5795+} 5796+ 5797+static inline int ElementOptMulIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5798+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 5799+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5800+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5801+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_); 5802+ SIMD_ST_EPI32(out + index, vout); 5803+ } 5804+ return index; 5805+} 5806+ 5807+static inline int ElementOptMulReluIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5808+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 5809+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5810+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5811+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f); 5812+ SIMD_ST_EPI32(out + index, vout); 5813+ } 5814+ return index; 5815+} 5816+ 5817+static inline int ElementOptMulReluIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5818+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 5819+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5820+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5821+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f); 5822+ SIMD_ST_EPI32(out + index, 
vout); 5823+ } 5824+ return index; 5825+} 5826+ 5827+static inline int ElementOptMulRelu6IntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5828+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 5829+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5830+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 5831+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f); 5832+ SIMD_ST_EPI32(out + index, vout); 5833+ } 5834+ return index; 5835+} 5836+ 5837+static inline int ElementOptMulRelu6IntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 5838+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 5839+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5840+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 5841+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f); 5842+ SIMD_ST_EPI32(out + index, vout); 5843+ } 5844+ return index; 5845+} 5846+ 5847+#undef MS_SIMD_INSTRUCTION 5848+#undef BLOCK_NUM 5849+#pragma GCC pop_options 5850+#undef MS_SIMD_AVX512 5851+#ifdef __cplusplus 5852+} 5853+#endif 5854+#endif 5855diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h 5856new file mode 100644 5857index 00000000..d1e001ee 5858--- /dev/null 5859+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h 5860@@ -0,0 +1,84 @@ 5861+/** 5862+ * Copyright 2022 Huawei Technologies Co., Ltd 5863+ * 5864+ * Licensed under the Apache License, Version 2.0 (the "License"); 5865+ * you may not use this file except in compliance with the License. 
5866+ * You may obtain a copy of the License at 5867+ * 5868+ * http://www.apache.org/licenses/LICENSE-2.0 5869+ * 5870+ * Unless required by applicable law or agreed to in writing, software 5871+ * distributed under the License is distributed on an "AS IS" BASIS, 5872+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5873+ * See the License for the specific language governing permissions and 5874+ * limitations under the License. 5875+ */ 5876+#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX512_H_ 5877+#define MINDSPORE_NNACL_FP32_POOLING_AVX512_H_ 5878+ 5879+#include "nnacl/intrinsics/ms_simd_instructions.h" 5880+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5881+ 5882+#ifdef __cplusplus 5883+extern "C" { 5884+#endif 5885+#pragma GCC push_options 5886+#pragma GCC target("avx512f") 5887+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5888+#define BLOCK_NUM 16 5889+#define MS_SIMD_AVX512 5890+ 5891+static inline int AvgPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel, 5892+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 5893+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 5894+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 5895+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 5896+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 5897+ const float *src_c_ptr = src_plane_ptr + ci; 5898+ float *dst_c_ptr = dst_plane_ptr + ci; 5899+ SIMD_F32 tmp_avg = SIMD_SET0_F32; 5900+ int real_count = 0; 5901+ for (int h = real_win_h_start; h < real_win_h_end; h++) { 5902+ for (int w = real_win_w_start; w < real_win_w_end; w++) { 5903+ const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; 5904+ tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); 5905+ ++real_count; 5906+ } 5907+ } 5908+ tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); 5909+ tmp_avg = 
SIMD_MAX_F32(tmp_avg, min_val); 5910+ tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); 5911+ SIMD_ST_F32(dst_c_ptr, tmp_avg); 5912+ } 5913+ return ci; 5914+} 5915+ 5916+static inline int MaxPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel, 5917+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 5918+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 5919+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 5920+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 5921+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 5922+ const float *src_c_ptr = src_plane_ptr + ci; 5923+ float *dst_c_ptr = dst_plane_ptr + ci; 5924+ SIMD_F32 tmp_max = min_val; 5925+ for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { 5926+ for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { 5927+ const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; 5928+ tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); 5929+ } 5930+ } 5931+ tmp_max = SIMD_MIN_F32(tmp_max, max_val); 5932+ SIMD_ST_F32(dst_c_ptr, tmp_max); 5933+ } 5934+ return ci; 5935+} 5936+ 5937+#undef MS_SIMD_INSTRUCTION 5938+#undef BLOCK_NUM 5939+#pragma GCC pop_options 5940+#undef MS_SIMD_AVX512 5941+#ifdef __cplusplus 5942+} 5943+#endif 5944+#endif 5945diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h 5946new file mode 100644 5947index 00000000..a31eaf2f 5948--- /dev/null 5949+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h 5950@@ -0,0 +1,101 @@ 5951+/** 5952+ * Copyright 2022 Huawei Technologies Co., Ltd 5953+ * 5954+ * Licensed under the Apache License, Version 2.0 (the "License"); 5955+ * you may not use this file except in compliance with the License. 
5956+ * You may obtain a copy of the License at 5957+ * 5958+ * http://www.apache.org/licenses/LICENSE-2.0 5959+ * 5960+ * Unless required by applicable law or agreed to in writing, software 5961+ * distributed under the License is distributed on an "AS IS" BASIS, 5962+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 5963+ * See the License for the specific language governing permissions and 5964+ * limitations under the License. 5965+ */ 5966+#ifndef MINDSPORE_NNACL_FP32_POWER_AVX512_H_ 5967+#define MINDSPORE_NNACL_FP32_POWER_AVX512_H_ 5968+ 5969+#include "nnacl/intrinsics/ms_simd_instructions.h" 5970+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 5971+ 5972+#ifdef __cplusplus 5973+extern "C" { 5974+#endif 5975+#pragma GCC push_options 5976+#pragma GCC target("avx512f") 5977+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 5978+#define BLOCK_NUM 16 5979+#define MS_SIMD_AVX512 5980+ 5981+static inline int PowerBroadCastIntExponentAVX512(int index, const float *input, int exponent, float *output, int len, 5982+ float scale, float shift) { 5983+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 5984+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 5985+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 5986+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 5987+ SIMD_F32 result = SIMD_MOV_F32(1.0f); 5988+ int exp = abs(exponent); 5989+ while (exp) { 5990+ if (exp % 2) { 5991+ result = SIMD_MUL_F32(result, tmp); 5992+ } 5993+ tmp = SIMD_MUL_SQUARE_F32(tmp); 5994+ exp = exp / 2; 5995+ } 5996+ SIMD_ST_F32(output + index, exponent >= 0 ? 
result : SIMD_DIV_F32(SIMD_MOV_F32(1), result)); 5997+ } 5998+ return index; 5999+} 6000+ 6001+static inline int PowerBroadCastFloatExponentAVX512(int index, const float *input, float exponent, float *output, int len, 6002+ float scale, float shift) { 6003+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 6004+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 6005+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6006+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 6007+ SIMD_F32 result; 6008+ for (int i = 0; i < BLOCK_NUM; ++i) { 6009+ SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent); 6010+ } 6011+ SIMD_ST_F32(output + index, result); 6012+ } 6013+ return index; 6014+} 6015+ 6016+static inline int PowerSingleExponentAVX512(int index, const float *input, const float *exponent, float *output, int len, 6017+ float scale, float shift) { 6018+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 6019+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 6020+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6021+ SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 6022+ for (int j = 0; j < BLOCK_NUM; ++j) { 6023+ float cur_exponent = exponent[index + j]; 6024+ float cur_val = SIMD_F32_GETI(tmp_vec, j); 6025+ if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) { 6026+ int exp = abs((int)(cur_exponent)); 6027+ float result = 1; 6028+ while (exp) { 6029+ if (exp % 2) { 6030+ result *= cur_val; 6031+ } 6032+ cur_val *= cur_val; 6033+ exp = exp / 2; 6034+ } 6035+ output[index + j] = *exponent >= 0 ? 
result : 1 / result; 6036+ } else { 6037+ output[index + j] = powf(cur_val, cur_exponent); 6038+ } 6039+ } 6040+ } 6041+ return index; 6042+} 6043+ 6044+#undef MS_SIMD_INSTRUCTION 6045+#undef BLOCK_NUM 6046+#pragma GCC pop_options 6047+#undef MS_SIMD_AVX512 6048+#ifdef __cplusplus 6049+} 6050+#endif 6051+#endif 6052diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h 6053new file mode 100644 6054index 00000000..5885a044 6055--- /dev/null 6056+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h 6057@@ -0,0 +1,181 @@ 6058+/** 6059+ * Copyright 2022 Huawei Technologies Co., Ltd 6060+ * 6061+ * Licensed under the Apache License, Version 2.0 (the "License"); 6062+ * you may not use this file except in compliance with the License. 6063+ * You may obtain a copy of the License at 6064+ * 6065+ * http://www.apache.org/licenses/LICENSE-2.0 6066+ * 6067+ * Unless required by applicable law or agreed to in writing, software 6068+ * distributed under the License is distributed on an "AS IS" BASIS, 6069+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6070+ * See the License for the specific language governing permissions and 6071+ * limitations under the License. 
6072+ */ 6073+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_ 6074+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_ 6075+ 6076+#include "nnacl/intrinsics/ms_simd_instructions.h" 6077+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 6078+ 6079+#ifdef __cplusplus 6080+extern "C" { 6081+#endif 6082+#pragma GCC push_options 6083+#pragma GCC target("avx512f") 6084+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 6085+#define BLOCK_NUM 16 6086+#define MS_SIMD_AVX512 6087+ 6088+static inline int64_t ReduceSumAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6089+ int axis_size) { 6090+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6091+ const float *inner_src = outer_src + index; 6092+ SIMD_F32 tmp = SIMD_MOV_F32(0); 6093+ for (int i = 0; i < axis_size; i++) { 6094+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 6095+ } 6096+ SIMD_ST_F32(outer_dst + index, tmp); 6097+ } 6098+ return index; 6099+} 6100+ 6101+static inline int64_t ReduceMeanAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6102+ int axis_size) { 6103+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6104+ const float *inner_src = outer_src + index; 6105+ SIMD_F32 tmp = SIMD_MOV_F32(0); 6106+ for (int i = 0; i < axis_size; i++) { 6107+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 6108+ } 6109+ SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size)); 6110+ } 6111+ return index; 6112+} 6113+ 6114+static inline int64_t ReduceMinAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6115+ int axis_size) { 6116+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6117+ const float *inner_src = outer_src + index; 6118+ SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX); 6119+ for (int i = 0; i < axis_size; i++) { 6120+ tmp = 
SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 6121+ } 6122+ SIMD_ST_F32(outer_dst + index, tmp); 6123+ } 6124+ return index; 6125+} 6126+ 6127+static inline int64_t ReduceMaxAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6128+ int axis_size) { 6129+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6130+ const float *inner_src = outer_src + index; 6131+ SIMD_F32 tmp = SIMD_MOV_F32(FLT_MIN); 6132+ for (int i = 0; i < axis_size; i++) { 6133+ tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 6134+ } 6135+ SIMD_ST_F32(outer_dst + index, tmp); 6136+ } 6137+ return index; 6138+} 6139+ 6140+static inline int64_t ReduceProdAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6141+ int axis_size) { 6142+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6143+ const float *inner_src = outer_src + index; 6144+ SIMD_F32 tmp = SIMD_MOV_F32(1.0f); 6145+ for (int i = 0; i < axis_size; i++) { 6146+ tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 6147+ } 6148+ SIMD_ST_F32(outer_dst + index, tmp); 6149+ } 6150+ return index; 6151+} 6152+ 6153+static inline int64_t ReduceSumSquareAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6154+ int axis_size) { 6155+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6156+ const float *inner_src = outer_src + index; 6157+ SIMD_F32 tmp = SIMD_MOV_F32(0); 6158+ for (int i = 0; i < axis_size; i++) { 6159+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 6160+ } 6161+ SIMD_ST_F32(outer_dst + index, tmp); 6162+ } 6163+ return index; 6164+} 6165+ 6166+static inline int64_t ReduceL2NormAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 6167+ int axis_size) { 6168+ for (int block_max_size = inner_size - 
BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6169+ const float *inner_src = outer_src + index; 6170+ SIMD_F32 tmp = SIMD_MOV_F32(0); 6171+ for (int i = 0; i < axis_size; i++) { 6172+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 6173+ } 6174+ SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); 6175+ } 6176+ return index; 6177+} 6178+ 6179+static inline int64_t IntReduceSumAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 6180+ int axis_size) { 6181+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6182+ const int *inner_src = outer_src + index; 6183+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 6184+ for (int i = 0; i < axis_size; i++) { 6185+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 6186+ } 6187+ SIMD_ST_EPI32(outer_dst + index, tmp); 6188+ } 6189+ return index; 6190+} 6191+ 6192+static inline int64_t IntReduceMeanAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 6193+ int axis_size) { 6194+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6195+ const int *inner_src = outer_src + index; 6196+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 6197+ for (int i = 0; i < axis_size; i++) { 6198+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 6199+ } 6200+ SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); 6201+ } 6202+ return index; 6203+} 6204+ 6205+static inline int64_t IntReduceMinAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 6206+ int axis_size) { 6207+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6208+ const int *inner_src = outer_src + index; 6209+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); 6210+ for (int i = 0; i < axis_size; i++) { 6211+ tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 6212+ } 6213+ 
SIMD_ST_EPI32(outer_dst + index, tmp); 6214+ } 6215+ return index; 6216+} 6217+ 6218+static inline int64_t IntReduceMaxAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 6219+ int axis_size) { 6220+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6221+ const int *inner_src = outer_src + index; 6222+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); 6223+ for (int i = 0; i < axis_size; i++) { 6224+ tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 6225+ } 6226+ SIMD_ST_EPI32(outer_dst + index, tmp); 6227+ } 6228+ return index; 6229+} 6230+ 6231+#undef MS_SIMD_INSTRUCTION 6232+#undef BLOCK_NUM 6233+#pragma GCC pop_options 6234+#undef MS_SIMD_AVX512 6235+#ifdef __cplusplus 6236+} 6237+#endif 6238+#endif 6239diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h 6240new file mode 100644 6241index 00000000..1fa1907e 6242--- /dev/null 6243+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h 6244@@ -0,0 +1,87 @@ 6245+/** 6246+ * Copyright 2022 Huawei Technologies Co., Ltd 6247+ * 6248+ * Licensed under the Apache License, Version 2.0 (the "License"); 6249+ * you may not use this file except in compliance with the License. 6250+ * You may obtain a copy of the License at 6251+ * 6252+ * http://www.apache.org/licenses/LICENSE-2.0 6253+ * 6254+ * Unless required by applicable law or agreed to in writing, software 6255+ * distributed under the License is distributed on an "AS IS" BASIS, 6256+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6257+ * See the License for the specific language governing permissions and 6258+ * limitations under the License. 
6259+ */ 6260+ 6261+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_ 6262+#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_ 6263+ 6264+#include "nnacl/intrinsics/ms_simd_instructions.h" 6265+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 6266+ 6267+#ifdef __cplusplus 6268+extern "C" { 6269+#endif 6270+#pragma GCC push_options 6271+#pragma GCC target("avx512f") 6272+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 6273+#define BLOCK_NUM 16 6274+#define MS_SIMD_AVX512 6275+ 6276+static inline int64_t SoftmaxNormGetMaxAVX512(int64_t index, const float *src, int cur_batch_offset, 6277+ float *max, int channel) { 6278+ if (channel >= BLOCK_NUM * BLOCK_NUM) { 6279+ SIMD_F32 max_val = SIMD_MOV_F32(*max); 6280+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6281+ max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); 6282+ } 6283+ *max = SIMD_GET_MAX_F32(max_val); 6284+ } 6285+ return index; 6286+} 6287+ 6288+static inline int64_t SoftmaxNormCalcNormAVX512(int64_t index, const float *src, float *dst, 6289+ int cur_batch_offset, float max, int channel) { 6290+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6291+ SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); 6292+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 6293+ } 6294+ return index; 6295+} 6296+ 6297+static inline int64_t SoftmaxLastAxisGetExpSumAVX512(int64_t index, const float *src, float *dst, 6298+ int cur_batch_offset, float max, float *exp_sum, int channel) { 6299+#ifndef _WIN32 6300+ SIMD_F32 sum_val = SIMD_SET0_F32; 6301+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6302+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 6303+ SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); 6304+ SIMD_F32 exp_out = SIMD_EXP_F32(output); 6305+ sum_val = 
SIMD_ADD_F32(sum_val, exp_out); 6306+ SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); 6307+ } 6308+ *exp_sum += SIMD_GET_SUM_F32(sum_val); 6309+#endif 6310+ return index; 6311+} 6312+ 6313+static inline int64_t SoftmaxLastAxisGetResultAVX512(int64_t index, const float *src, float *dst, 6314+ int cur_batch_offset, float exp_sum, int channel) { 6315+ SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); 6316+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6317+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 6318+ SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); 6319+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 6320+ } 6321+ return index; 6322+} 6323+ 6324+#undef MS_SIMD_INSTRUCTION 6325+#undef BLOCK_NUM 6326+#pragma GCC pop_options 6327+#undef MS_SIMD_AVX512 6328+#ifdef __cplusplus 6329+}; 6330+#endif 6331+#endif 6332diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h 6333new file mode 100644 6334index 00000000..994fc7c0 6335--- /dev/null 6336+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h 6337@@ -0,0 +1,167 @@ 6338+/** 6339+ * Copyright 2022 Huawei Technologies Co., Ltd 6340+ * 6341+ * Licensed under the Apache License, Version 2.0 (the "License"); 6342+ * you may not use this file except in compliance with the License. 6343+ * You may obtain a copy of the License at 6344+ * 6345+ * http://www.apache.org/licenses/LICENSE-2.0 6346+ * 6347+ * Unless required by applicable law or agreed to in writing, software 6348+ * distributed under the License is distributed on an "AS IS" BASIS, 6349+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6350+ * See the License for the specific language governing permissions and 6351+ * limitations under the License. 
6352+ */ 6353+ 6354+#ifndef MINDSPORE_NNACL_FP32_SUB_AVX512_H_ 6355+#define MINDSPORE_NNACL_FP32_SUB_AVX512_H_ 6356+ 6357+#include "nnacl/intrinsics/ms_simd_instructions.h" 6358+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" 6359+ 6360+#ifdef __cplusplus 6361+extern "C" { 6362+#endif 6363+#pragma GCC push_options 6364+#pragma GCC target("avx512f") 6365+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION 6366+#define BLOCK_NUM 16 6367+#define MS_SIMD_AVX512 6368+ 6369+static inline int ElementOptSubNum0AVX512(int index, const float *in0, const float *in1, float *out, 6370+ int size) { 6371+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 6372+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6373+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 6374+ SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); 6375+ SIMD_ST_F32(out + index, vout); 6376+ } 6377+ return index; 6378+} 6379+ 6380+static inline int ElementOptSubNum1AVX512(int index, const float *in0, const float *in1, float *out, 6381+ int size) { 6382+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 6383+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6384+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 6385+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); 6386+ SIMD_ST_F32(out + index, vout); 6387+ } 6388+ return index; 6389+} 6390+ 6391+static inline int ElementOptSubIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { 6392+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 6393+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6394+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 6395+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); 6396+ SIMD_ST_EPI32(out + index, vout); 6397+ } 6398+ return index; 6399+} 6400+ 6401+static inline int ElementOptSubIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { 6402+ SIMD_EPI32 vin1_opt_ = 
SIMD_MOV_EPI32(in1[0]); 6403+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6404+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 6405+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); 6406+ SIMD_ST_EPI32(out + index, vout); 6407+ } 6408+ return index; 6409+} 6410+ 6411+static inline int ElementOptSubReluNum0AVX512(int index, const float *in0, const float *in1, float *out, 6412+ int size) { 6413+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 6414+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6415+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 6416+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); 6417+ SIMD_ST_F32(out + index, vout); 6418+ } 6419+ return index; 6420+} 6421+ 6422+static inline int ElementOptSubReluNum1AVX512(int index, const float *in0, const float *in1, float *out, 6423+ int size) { 6424+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 6425+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6426+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 6427+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); 6428+ SIMD_ST_F32(out + index, vout); 6429+ } 6430+ return index; 6431+} 6432+ 6433+static inline int ElementOptSubRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, 6434+ int size) { 6435+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 6436+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6437+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 6438+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); 6439+ SIMD_ST_F32(out + index, vout); 6440+ } 6441+ return index; 6442+} 6443+ 6444+static inline int ElementOptSubRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, 6445+ int size) { 6446+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 6447+ for (int block_max_size = size - BLOCK_NUM + 
1; index < block_max_size; index += BLOCK_NUM) { 6448+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 6449+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); 6450+ SIMD_ST_F32(out + index, vout); 6451+ } 6452+ return index; 6453+} 6454+ 6455+static inline int ElementSubAVX512(int index, const float *in0, const float *in1, float *out, int size) { 6456+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6457+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 6458+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 6459+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); 6460+ SIMD_ST_F32(out + index, vout); 6461+ } 6462+ return index; 6463+} 6464+ 6465+static inline int ElementSubIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { 6466+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6467+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 6468+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 6469+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); 6470+ SIMD_ST_EPI32(out + index, vout); 6471+ } 6472+ return index; 6473+} 6474+ 6475+static inline int ElementSubReluAVX512(int index, const float *in0, const float *in1, float *out, 6476+ int size) { 6477+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6478+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 6479+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 6480+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); 6481+ SIMD_ST_F32(out + index, vout); 6482+ } 6483+ return index; 6484+} 6485+ 6486+static inline int ElementSubRelu6AVX512(int index, const float *in0, const float *in1, float *out, 6487+ int size) { 6488+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 6489+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 6490+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 6491+ SIMD_F32 vout = 
SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); 6492+ SIMD_ST_F32(out + index, vout); 6493+ } 6494+ return index; 6495+} 6496+ 6497+#undef MS_SIMD_INSTRUCTION 6498+#undef BLOCK_NUM 6499+#pragma GCC pop_options 6500+#undef MS_SIMD_AVX512 6501+#ifdef __cplusplus 6502+}; 6503+#endif 6504+#endif 6505diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h 6506new file mode 100644 6507index 00000000..88908c90 6508--- /dev/null 6509+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h 6510@@ -0,0 +1,36 @@ 6511+/** 6512+ * Copyright 2022 Huawei Technologies Co., Ltd 6513+ * 6514+ * Licensed under the Apache License, Version 2.0 (the "License"); 6515+ * you may not use this file except in compliance with the License. 6516+ * You may obtain a copy of the License at 6517+ * 6518+ * http://www.apache.org/licenses/LICENSE-2.0 6519+ * 6520+ * Unless required by applicable law or agreed to in writing, software 6521+ * distributed under the License is distributed on an "AS IS" BASIS, 6522+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6523+ * See the License for the specific language governing permissions and 6524+ * limitations under the License. 
6525+ */ 6526+#ifndef MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_ 6527+#define MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_ 6528+ 6529+#include "nnacl/intrinsics/ms_simd_instructions.h" 6530+#ifdef ENABLE_AVX512 6531+#include "nnacl/avx512/batchnorm_fp32_avx512.h" 6532+#endif 6533+ 6534+#ifdef ENABLE_AVX 6535+#include "nnacl/avx/batchnorm_fp32_avx.h" 6536+#endif 6537+ 6538+#ifdef ENABLE_SSE 6539+#include "nnacl/sse/batchnorm_fp32_sse.h" 6540+#endif 6541+ 6542+#ifdef ENABLE_ARM 6543+#include "nnacl/neon/batchnorm_fp32_neon.h" 6544+#endif 6545+ 6546+#endif 6547diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h 6548new file mode 100644 6549index 00000000..f36981ab 6550--- /dev/null 6551+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h 6552@@ -0,0 +1,36 @@ 6553+/** 6554+ * Copyright 2022 Huawei Technologies Co., Ltd 6555+ * 6556+ * Licensed under the Apache License, Version 2.0 (the "License"); 6557+ * you may not use this file except in compliance with the License. 6558+ * You may obtain a copy of the License at 6559+ * 6560+ * http://www.apache.org/licenses/LICENSE-2.0 6561+ * 6562+ * Unless required by applicable law or agreed to in writing, software 6563+ * distributed under the License is distributed on an "AS IS" BASIS, 6564+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6565+ * See the License for the specific language governing permissions and 6566+ * limitations under the License. 
6567+ */ 6568+#ifndef MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_ 6569+#define MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_ 6570+ 6571+#include "nnacl/intrinsics/ms_simd_instructions.h" 6572+#ifdef ENABLE_AVX512 6573+#include "nnacl/avx512/bce_with_logits_loss_fp32_avx512.h" 6574+#endif 6575+ 6576+#ifdef ENABLE_AVX 6577+#include "nnacl/avx/bce_with_logits_loss_fp32_avx.h" 6578+#endif 6579+ 6580+#ifdef ENABLE_SSE 6581+#include "nnacl/sse/bce_with_logits_loss_fp32_sse.h" 6582+#endif 6583+ 6584+#ifdef ENABLE_ARM 6585+#include "nnacl/neon/bce_with_logits_loss_fp32_neon.h" 6586+#endif 6587+ 6588+#endif 6589diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h 6590new file mode 100644 6591index 00000000..e765b1eb 6592--- /dev/null 6593+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h 6594@@ -0,0 +1,36 @@ 6595+/** 6596+ * Copyright 2022 Huawei Technologies Co., Ltd 6597+ * 6598+ * Licensed under the Apache License, Version 2.0 (the "License"); 6599+ * you may not use this file except in compliance with the License. 6600+ * You may obtain a copy of the License at 6601+ * 6602+ * http://www.apache.org/licenses/LICENSE-2.0 6603+ * 6604+ * Unless required by applicable law or agreed to in writing, software 6605+ * distributed under the License is distributed on an "AS IS" BASIS, 6606+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6607+ * See the License for the specific language governing permissions and 6608+ * limitations under the License. 
6609+ */ 6610+#ifndef MINDSPORE_NNACL_BIAS_ADD_SIMD_H_ 6611+#define MINDSPORE_NNACL_BIAS_ADD_SIMD_H_ 6612+ 6613+#include "nnacl/intrinsics/ms_simd_instructions.h" 6614+#ifdef ENABLE_AVX512 6615+#include "nnacl/avx512/bias_add_avx512.h" 6616+#endif 6617+ 6618+#ifdef ENABLE_AVX 6619+#include "nnacl/avx/bias_add_avx.h" 6620+#endif 6621+ 6622+#ifdef ENABLE_SSE 6623+#include "nnacl/sse/bias_add_sse.h" 6624+#endif 6625+ 6626+#ifdef ENABLE_ARM 6627+#include "nnacl/neon/bias_add_neon.h" 6628+#endif 6629+ 6630+#endif 6631diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h 6632new file mode 100644 6633index 00000000..93d8ca33 6634--- /dev/null 6635+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h 6636@@ -0,0 +1,36 @@ 6637+/** 6638+ * Copyright 2022 Huawei Technologies Co., Ltd 6639+ * 6640+ * Licensed under the Apache License, Version 2.0 (the "License"); 6641+ * you may not use this file except in compliance with the License. 6642+ * You may obtain a copy of the License at 6643+ * 6644+ * http://www.apache.org/licenses/LICENSE-2.0 6645+ * 6646+ * Unless required by applicable law or agreed to in writing, software 6647+ * distributed under the License is distributed on an "AS IS" BASIS, 6648+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6649+ * See the License for the specific language governing permissions and 6650+ * limitations under the License. 
6651+ */ 6652+#ifndef MINDSPORE_NNACL_CAST_BASE_SIMD_H_ 6653+#define MINDSPORE_NNACL_CAST_BASE_SIMD_H_ 6654+ 6655+#include "nnacl/intrinsics/ms_simd_instructions.h" 6656+#ifdef ENABLE_AVX512 6657+#include "nnacl/avx512/cast_base_avx512.h" 6658+#endif 6659+ 6660+#ifdef ENABLE_AVX 6661+#include "nnacl/avx/cast_base_avx.h" 6662+#endif 6663+ 6664+#ifdef ENABLE_SSE 6665+#include "nnacl/sse/cast_base_sse.h" 6666+#endif 6667+ 6668+#ifdef ENABLE_ARM 6669+#include "nnacl/neon/cast_base_neon.h" 6670+#endif 6671+ 6672+#endif 6673diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h 6674new file mode 100644 6675index 00000000..70f79645 6676--- /dev/null 6677+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h 6678@@ -0,0 +1,36 @@ 6679+/** 6680+ * Copyright 2022 Huawei Technologies Co., Ltd 6681+ * 6682+ * Licensed under the Apache License, Version 2.0 (the "License"); 6683+ * you may not use this file except in compliance with the License. 6684+ * You may obtain a copy of the License at 6685+ * 6686+ * http://www.apache.org/licenses/LICENSE-2.0 6687+ * 6688+ * Unless required by applicable law or agreed to in writing, software 6689+ * distributed under the License is distributed on an "AS IS" BASIS, 6690+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6691+ * See the License for the specific language governing permissions and 6692+ * limitations under the License. 
6693+ */ 6694+#ifndef MINDSPORE_NNACL_CDIST_FP32_SIMD_H_ 6695+#define MINDSPORE_NNACL_CDIST_FP32_SIMD_H_ 6696+ 6697+#include "nnacl/intrinsics/ms_simd_instructions.h" 6698+#ifdef ENABLE_AVX512 6699+#include "nnacl/avx512/cdist_fp32_avx512.h" 6700+#endif 6701+ 6702+#ifdef ENABLE_AVX 6703+#include "nnacl/avx/cdist_fp32_avx.h" 6704+#endif 6705+ 6706+#ifdef ENABLE_SSE 6707+#include "nnacl/sse/cdist_fp32_sse.h" 6708+#endif 6709+ 6710+#ifdef ENABLE_ARM 6711+#include "nnacl/neon/cdist_fp32_neon.h" 6712+#endif 6713+ 6714+#endif 6715diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h 6716new file mode 100644 6717index 00000000..b6979626 6718--- /dev/null 6719+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h 6720@@ -0,0 +1,36 @@ 6721+/** 6722+ * Copyright 2022 Huawei Technologies Co., Ltd 6723+ * 6724+ * Licensed under the Apache License, Version 2.0 (the "License"); 6725+ * you may not use this file except in compliance with the License. 6726+ * You may obtain a copy of the License at 6727+ * 6728+ * http://www.apache.org/licenses/LICENSE-2.0 6729+ * 6730+ * Unless required by applicable law or agreed to in writing, software 6731+ * distributed under the License is distributed on an "AS IS" BASIS, 6732+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6733+ * See the License for the specific language governing permissions and 6734+ * limitations under the License. 
6735+ */ 6736+#ifndef MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_ 6737+#define MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_ 6738+ 6739+#include "nnacl/intrinsics/ms_simd_instructions.h" 6740+#ifdef ENABLE_AVX512 6741+#include "nnacl/avx512/cumsum_fp32_avx512.h" 6742+#endif 6743+ 6744+#ifdef ENABLE_AVX 6745+#include "nnacl/avx/cumsum_fp32_avx.h" 6746+#endif 6747+ 6748+#ifdef ENABLE_SSE 6749+#include "nnacl/sse/cumsum_fp32_sse.h" 6750+#endif 6751+ 6752+#ifdef ENABLE_ARM 6753+#include "nnacl/neon/cumsum_fp32_neon.h" 6754+#endif 6755+ 6756+#endif 6757diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h 6758new file mode 100644 6759index 00000000..dcae16ff 6760--- /dev/null 6761+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h 6762@@ -0,0 +1,36 @@ 6763+/** 6764+ * Copyright 2022 Huawei Technologies Co., Ltd 6765+ * 6766+ * Licensed under the Apache License, Version 2.0 (the "License"); 6767+ * you may not use this file except in compliance with the License. 6768+ * You may obtain a copy of the License at 6769+ * 6770+ * http://www.apache.org/licenses/LICENSE-2.0 6771+ * 6772+ * Unless required by applicable law or agreed to in writing, software 6773+ * distributed under the License is distributed on an "AS IS" BASIS, 6774+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6775+ * See the License for the specific language governing permissions and 6776+ * limitations under the License. 
6777+ */ 6778+#ifndef MINDSPORE_NNACL_DIV_FP32_SIMD_H_ 6779+#define MINDSPORE_NNACL_DIV_FP32_SIMD_H_ 6780+ 6781+#include "nnacl/intrinsics/ms_simd_instructions.h" 6782+#ifdef ENABLE_AVX512 6783+#include "nnacl/avx512/div_fp32_avx512.h" 6784+#endif 6785+ 6786+#ifdef ENABLE_AVX 6787+#include "nnacl/avx/div_fp32_avx.h" 6788+#endif 6789+ 6790+#ifdef ENABLE_SSE 6791+#include "nnacl/sse/div_fp32_sse.h" 6792+#endif 6793+ 6794+#ifdef ENABLE_ARM 6795+#include "nnacl/neon/div_fp32_neon.h" 6796+#endif 6797+ 6798+#endif 6799diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h 6800new file mode 100644 6801index 00000000..704591c5 6802--- /dev/null 6803+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h 6804@@ -0,0 +1,36 @@ 6805+/** 6806+ * Copyright 2022 Huawei Technologies Co., Ltd 6807+ * 6808+ * Licensed under the Apache License, Version 2.0 (the "License"); 6809+ * you may not use this file except in compliance with the License. 6810+ * You may obtain a copy of the License at 6811+ * 6812+ * http://www.apache.org/licenses/LICENSE-2.0 6813+ * 6814+ * Unless required by applicable law or agreed to in writing, software 6815+ * distributed under the License is distributed on an "AS IS" BASIS, 6816+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6817+ * See the License for the specific language governing permissions and 6818+ * limitations under the License. 
6819+ */ 6820+#ifndef MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_ 6821+#define MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_ 6822+ 6823+#include "nnacl/intrinsics/ms_simd_instructions.h" 6824+#ifdef ENABLE_AVX512 6825+#include "nnacl/avx512/dropout_fp32_avx512.h" 6826+#endif 6827+ 6828+#ifdef ENABLE_AVX 6829+#include "nnacl/avx/dropout_fp32_avx.h" 6830+#endif 6831+ 6832+#ifdef ENABLE_SSE 6833+#include "nnacl/sse/dropout_fp32_sse.h" 6834+#endif 6835+ 6836+#ifdef ENABLE_ARM 6837+#include "nnacl/neon/dropout_fp32_neon.h" 6838+#endif 6839+ 6840+#endif 6841diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h 6842new file mode 100644 6843index 00000000..272f5934 6844--- /dev/null 6845+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h 6846@@ -0,0 +1,36 @@ 6847+/** 6848+ * Copyright 2022 Huawei Technologies Co., Ltd 6849+ * 6850+ * Licensed under the Apache License, Version 2.0 (the "License"); 6851+ * you may not use this file except in compliance with the License. 6852+ * You may obtain a copy of the License at 6853+ * 6854+ * http://www.apache.org/licenses/LICENSE-2.0 6855+ * 6856+ * Unless required by applicable law or agreed to in writing, software 6857+ * distributed under the License is distributed on an "AS IS" BASIS, 6858+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6859+ * See the License for the specific language governing permissions and 6860+ * limitations under the License. 
6861+ */ 6862+#ifndef MINDSPORE_NNACL_EXP_FP32_SIMD_H_ 6863+#define MINDSPORE_NNACL_EXP_FP32_SIMD_H_ 6864+ 6865+#include "nnacl/intrinsics/ms_simd_instructions.h" 6866+#ifdef ENABLE_AVX512 6867+#include "nnacl/avx512/exp_fp32_avx512.h" 6868+#endif 6869+ 6870+#ifdef ENABLE_AVX 6871+#include "nnacl/avx/exp_fp32_avx.h" 6872+#endif 6873+ 6874+#ifdef ENABLE_SSE 6875+#include "nnacl/sse/exp_fp32_sse.h" 6876+#endif 6877+ 6878+#ifdef ENABLE_ARM 6879+#include "nnacl/neon/exp_fp32_neon.h" 6880+#endif 6881+ 6882+#endif 6883diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h 6884new file mode 100644 6885index 00000000..f3099405 6886--- /dev/null 6887+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h 6888@@ -0,0 +1,36 @@ 6889+/** 6890+ * Copyright 2022 Huawei Technologies Co., Ltd 6891+ * 6892+ * Licensed under the Apache License, Version 2.0 (the "License"); 6893+ * you may not use this file except in compliance with the License. 6894+ * You may obtain a copy of the License at 6895+ * 6896+ * http://www.apache.org/licenses/LICENSE-2.0 6897+ * 6898+ * Unless required by applicable law or agreed to in writing, software 6899+ * distributed under the License is distributed on an "AS IS" BASIS, 6900+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6901+ * See the License for the specific language governing permissions and 6902+ * limitations under the License. 
6903+ */ 6904+#ifndef MINDSPORE_NNACL_FILL_BASE_SIMD_H_ 6905+#define MINDSPORE_NNACL_FILL_BASE_SIMD_H_ 6906+ 6907+#include "nnacl/intrinsics/ms_simd_instructions.h" 6908+#ifdef ENABLE_AVX512 6909+#include "nnacl/avx512/fill_base_avx512.h" 6910+#endif 6911+ 6912+#ifdef ENABLE_AVX 6913+#include "nnacl/avx/fill_base_avx.h" 6914+#endif 6915+ 6916+#ifdef ENABLE_SSE 6917+#include "nnacl/sse/fill_base_sse.h" 6918+#endif 6919+ 6920+#ifdef ENABLE_ARM 6921+#include "nnacl/neon/fill_base_neon.h" 6922+#endif 6923+ 6924+#endif 6925diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h 6926new file mode 100644 6927index 00000000..a3931c20 6928--- /dev/null 6929+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h 6930@@ -0,0 +1,36 @@ 6931+/** 6932+ * Copyright 2022 Huawei Technologies Co., Ltd 6933+ * 6934+ * Licensed under the Apache License, Version 2.0 (the "License"); 6935+ * you may not use this file except in compliance with the License. 6936+ * You may obtain a copy of the License at 6937+ * 6938+ * http://www.apache.org/licenses/LICENSE-2.0 6939+ * 6940+ * Unless required by applicable law or agreed to in writing, software 6941+ * distributed under the License is distributed on an "AS IS" BASIS, 6942+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6943+ * See the License for the specific language governing permissions and 6944+ * limitations under the License. 
6945+ */ 6946+#ifndef MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_ 6947+#define MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_ 6948+ 6949+#include "nnacl/intrinsics/ms_simd_instructions.h" 6950+#ifdef ENABLE_AVX512 6951+#include "nnacl/avx512/group_norm_fp32_avx512.h" 6952+#endif 6953+ 6954+#ifdef ENABLE_AVX 6955+#include "nnacl/avx/group_norm_fp32_avx.h" 6956+#endif 6957+ 6958+#ifdef ENABLE_SSE 6959+#include "nnacl/sse/group_norm_fp32_sse.h" 6960+#endif 6961+ 6962+#ifdef ENABLE_ARM 6963+#include "nnacl/neon/group_norm_fp32_neon.h" 6964+#endif 6965+ 6966+#endif 6967diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h 6968new file mode 100644 6969index 00000000..c08461d3 6970--- /dev/null 6971+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h 6972@@ -0,0 +1,36 @@ 6973+/** 6974+ * Copyright 2022 Huawei Technologies Co., Ltd 6975+ * 6976+ * Licensed under the Apache License, Version 2.0 (the "License"); 6977+ * you may not use this file except in compliance with the License. 6978+ * You may obtain a copy of the License at 6979+ * 6980+ * http://www.apache.org/licenses/LICENSE-2.0 6981+ * 6982+ * Unless required by applicable law or agreed to in writing, software 6983+ * distributed under the License is distributed on an "AS IS" BASIS, 6984+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6985+ * See the License for the specific language governing permissions and 6986+ * limitations under the License. 
6987+ */ 6988+#ifndef MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_ 6989+#define MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_ 6990+ 6991+#include "nnacl/intrinsics/ms_simd_instructions.h" 6992+#ifdef ENABLE_AVX512 6993+#include "nnacl/avx512/layer_norm_fp32_avx512.h" 6994+#endif 6995+ 6996+#ifdef ENABLE_AVX 6997+#include "nnacl/avx/layer_norm_fp32_avx.h" 6998+#endif 6999+ 7000+#ifdef ENABLE_SSE 7001+#include "nnacl/sse/layer_norm_fp32_sse.h" 7002+#endif 7003+ 7004+#ifdef ENABLE_ARM 7005+#include "nnacl/neon/layer_norm_fp32_neon.h" 7006+#endif 7007+ 7008+#endif 7009diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h 7010new file mode 100644 7011index 00000000..1250f3fc 7012--- /dev/null 7013+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h 7014@@ -0,0 +1,36 @@ 7015+/** 7016+ * Copyright 2022 Huawei Technologies Co., Ltd 7017+ * 7018+ * Licensed under the Apache License, Version 2.0 (the "License"); 7019+ * you may not use this file except in compliance with the License. 7020+ * You may obtain a copy of the License at 7021+ * 7022+ * http://www.apache.org/licenses/LICENSE-2.0 7023+ * 7024+ * Unless required by applicable law or agreed to in writing, software 7025+ * distributed under the License is distributed on an "AS IS" BASIS, 7026+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7027+ * See the License for the specific language governing permissions and 7028+ * limitations under the License. 
7029+ */ 7030+#ifndef MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_ 7031+#define MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_ 7032+ 7033+#include "nnacl/intrinsics/ms_simd_instructions.h" 7034+#ifdef ENABLE_AVX512 7035+#include "nnacl/avx512/matmul_fp32_avx512.h" 7036+#endif 7037+ 7038+#ifdef ENABLE_AVX 7039+#include "nnacl/avx/matmul_fp32_avx.h" 7040+#endif 7041+ 7042+#ifdef ENABLE_SSE 7043+#include "nnacl/sse/matmul_fp32_sse.h" 7044+#endif 7045+ 7046+#ifdef ENABLE_ARM 7047+#include "nnacl/neon/matmul_fp32_neon.h" 7048+#endif 7049+ 7050+#endif 7051diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h 7052new file mode 100644 7053index 00000000..31e08b08 7054--- /dev/null 7055+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h 7056@@ -0,0 +1,36 @@ 7057+/** 7058+ * Copyright 2022 Huawei Technologies Co., Ltd 7059+ * 7060+ * Licensed under the Apache License, Version 2.0 (the "License"); 7061+ * you may not use this file except in compliance with the License. 7062+ * You may obtain a copy of the License at 7063+ * 7064+ * http://www.apache.org/licenses/LICENSE-2.0 7065+ * 7066+ * Unless required by applicable law or agreed to in writing, software 7067+ * distributed under the License is distributed on an "AS IS" BASIS, 7068+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7069+ * See the License for the specific language governing permissions and 7070+ * limitations under the License. 
7071+ */ 7072+#ifndef MINDSPORE_NNACL_MUL_FP32_SIMD_H_ 7073+#define MINDSPORE_NNACL_MUL_FP32_SIMD_H_ 7074+ 7075+#include "nnacl/intrinsics/ms_simd_instructions.h" 7076+#ifdef ENABLE_AVX512 7077+#include "nnacl/avx512/mul_fp32_avx512.h" 7078+#endif 7079+ 7080+#ifdef ENABLE_AVX 7081+#include "nnacl/avx/mul_fp32_avx.h" 7082+#endif 7083+ 7084+#ifdef ENABLE_SSE 7085+#include "nnacl/sse/mul_fp32_sse.h" 7086+#endif 7087+ 7088+#ifdef ENABLE_ARM 7089+#include "nnacl/neon/mul_fp32_neon.h" 7090+#endif 7091+ 7092+#endif 7093diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h 7094new file mode 100644 7095index 00000000..42d163f6 7096--- /dev/null 7097+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h 7098@@ -0,0 +1,220 @@ 7099+/** 7100+ * Copyright 2022 Huawei Technologies Co., Ltd 7101+ * 7102+ * Licensed under the Apache License, Version 2.0 (the "License"); 7103+ * you may not use this file except in compliance with the License. 7104+ * You may obtain a copy of the License at 7105+ * 7106+ * http://www.apache.org/licenses/LICENSE-2.0 7107+ * 7108+ * Unless required by applicable law or agreed to in writing, software 7109+ * distributed under the License is distributed on an "AS IS" BASIS, 7110+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7111+ * See the License for the specific language governing permissions and 7112+ * limitations under the License. 
7113+ */ 7114+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_ 7115+#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_ 7116+ 7117+#include "nnacl/intrinsics/ms_simd_instructions.h" 7118+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 7119+ 7120+#ifdef __cplusplus 7121+extern "C" { 7122+#endif 7123+ 7124+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 7125+#define BLOCK_NUM 4 7126+#define MS_SIMD_NEON 7127+ 7128+static inline int Fp32ReluNEON(int index, const float *src, int length, float *dst) { 7129+ SIMD_F32 zero = SIMD_SET0_F32; 7130+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7131+ SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); 7132+ } 7133+ return index; 7134+} 7135+ 7136+static inline int Int32ReluNEON(int index, const int32_t *src, int length, int32_t *dst) { 7137+ SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f); 7138+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7139+ SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); 7140+ } 7141+ return index; 7142+} 7143+ 7144+static inline int Fp32Relu6NEON(int index, const float *src, int length, float *dst) { 7145+ SIMD_F32 zero = SIMD_SET0_F32; 7146+ SIMD_F32 six = SIMD_MOV_F32(6.0f); 7147+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7148+ SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); 7149+ } 7150+ return index; 7151+} 7152+ 7153+static inline int LReluNEON(int index, const float *src, int length, float *dst, float alpha) { 7154+ SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); 7155+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7156+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 7157+ SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp); 7158+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask)); 
7159+ } 7160+ return index; 7161+} 7162+ 7163+static inline int SigmoidNEON(int index, const float *src, int length, float *dst) { 7164+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7165+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); 7166+ SIMD_ST_F32(dst + index, 7167+ SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 7168+ } 7169+ return index; 7170+} 7171+ 7172+static inline int TanhNEON(int index, const float *src, int length, float *dst) { 7173+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7174+ SIMD_F32 input = SIMD_LD_F32(src + index); 7175+ SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); 7176+ } 7177+ return index; 7178+} 7179+ 7180+static inline int SwishNEON(int index, const float *src, int length, float *dst) { 7181+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7182+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 7183+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); 7184+ SIMD_ST_F32(dst + index, 7185+ SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 7186+ } 7187+ return index; 7188+} 7189+ 7190+static inline int HSwishNEON(int index, const float *src, int length, float *dst) { 7191+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7192+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 7193+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 7194+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); 7195+ } 7196+ return index; 7197+} 7198+ 7199+static inline int HSigmoidNEON(int index, const float *src, int length, float *dst) { 7200+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7201+ SIMD_F32 src_value = SIMD_LD_F32(src + 
index); 7202+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 7203+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); 7204+ } 7205+ return index; 7206+} 7207+ 7208+static inline int HardTanhNoLimitMinNEON(int index, const float *src, int length, float *dst, float min_val, 7209+ float max_val) { 7210+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7211+ SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); 7212+ } 7213+ return index; 7214+} 7215+ 7216+static inline int HardTanhNoLimitMaxNEON(int index, const float *src, int length, float *dst, float min_val, 7217+ float max_val) { 7218+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7219+ SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); 7220+ } 7221+ return index; 7222+} 7223+ 7224+static inline int HardTanhLimitMinMaxNEON(int index, const float *src, int length, float *dst, float min_val, 7225+ float max_val) { 7226+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7227+ SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); 7228+ } 7229+ return index; 7230+} 7231+ 7232+static inline int GeluApproximateNEON(int index, const float *src, int length, float *dst) { 7233+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7234+ SIMD_F32 in = SIMD_LD_F32(src + index); 7235+ SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); 7236+ SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); 7237+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); 7238+ } 7239+ return index; 7240+} 7241+ 7242+static inline int GeluNEON(int index, const float *src, int length, float *dst) { 7243+ SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); 7244+ 
SIMD_F32 para2 = SIMD_MOV_F32(1.0f); 7245+ SIMD_F32 para3 = SIMD_MOV_F32(0.5f); 7246+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7247+ SIMD_F32 in = SIMD_LD_F32(src + index); 7248+ SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); 7249+ SIMD_ST_F32(dst + index, res); 7250+ } 7251+ return index; 7252+} 7253+ 7254+static inline int EluNEON(int index, const float *src, int length, float *dst, float alpha) { 7255+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7256+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 7257+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); 7258+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 7259+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 7260+ } 7261+ return index; 7262+} 7263+ 7264+static inline int CeluNEON(int index, const float *src, int length, float *dst, float alpha) { 7265+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7266+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 7267+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); 7268+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 7269+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 7270+ } 7271+ return index; 7272+} 7273+ 7274+static inline int HShrinkNEON(int index, const float *src, int length, float *dst, float lambd) { 7275+ const float neg_lambd = -1 * lambd; 7276+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7277+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 7278+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd)); 7279+ SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp); 7280+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 7281+ 
SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask)); 7282+ } 7283+ return index; 7284+} 7285+ 7286+static inline int SoftShrinkNEON(int index, const float *src, int length, float *dst, float lambd) { 7287+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 7288+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 7289+ 7290+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7291+ SIMD_F32 src_t = SIMD_LD_F32(src + index); 7292+ /* v0 = (in > lamdb) & (in - lamdb) */ 7293+ SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v)); 7294+ /* v1 = (in < -lamdb) & (in + lamdb) */ 7295+ SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v)); 7296+ /* out = (v0 | v1) */ 7297+ SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1)); 7298+ } 7299+ return index; 7300+} 7301+ 7302+static inline int SoftsignFp32OptNEON(int index, const float *src, int length, float *dst) { 7303+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7304+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 7305+ SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp)); 7306+ SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp)); 7307+ } 7308+ return index; 7309+} 7310+ 7311+#undef MS_SIMD_INSTRUCTION 7312+#undef BLOCK_NUM 7313+ 7314+#undef MS_SIMD_NEON 7315+#ifdef __cplusplus 7316+} 7317+#endif 7318+#endif 7319diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h 7320new file mode 100644 7321index 00000000..df832e51 7322--- /dev/null 7323+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h 7324@@ -0,0 +1,56 @@ 7325+/** 7326+ * Copyright 2022 Huawei Technologies Co., Ltd 7327+ * 7328+ * Licensed under the 
Apache License, Version 2.0 (the "License"); 7329+ * you may not use this file except in compliance with the License. 7330+ * You may obtain a copy of the License at 7331+ * 7332+ * http://www.apache.org/licenses/LICENSE-2.0 7333+ * 7334+ * Unless required by applicable law or agreed to in writing, software 7335+ * distributed under the License is distributed on an "AS IS" BASIS, 7336+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7337+ * See the License for the specific language governing permissions and 7338+ * limitations under the License. 7339+ */ 7340+ 7341+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_ 7342+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_ 7343+ 7344+#include "nnacl/intrinsics/ms_simd_instructions.h" 7345+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 7346+ 7347+#ifdef __cplusplus 7348+extern "C" { 7349+#endif 7350+ 7351+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 7352+#define BLOCK_NUM 4 7353+#define MS_SIMD_NEON 7354+ 7355+static inline int ShrinkGradNEON(int index, const float *src0, const float *src1, 7356+ int length, float *dst, float lambd) { 7357+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 7358+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 7359+ 7360+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7361+ SIMD_F32 src0_t = SIMD_LD_F32(src0 + index); 7362+ SIMD_F32 src1_t = SIMD_LD_F32(src1 + index); 7363+ 7364+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v); 7365+ SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t); 7366+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 7367+ 7368+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask)); 7369+ } 7370+ return index; 7371+} 7372+ 7373+#undef MS_SIMD_INSTRUCTION 7374+#undef BLOCK_NUM 7375+ 7376+#undef MS_SIMD_NEON 7377+#ifdef __cplusplus 7378+} 7379+#endif 7380+#endif 7381diff --git 
a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h 7382new file mode 100644 7383index 00000000..fda41ec2 7384--- /dev/null 7385+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h 7386@@ -0,0 +1,209 @@ 7387+/** 7388+ * Copyright 2022 Huawei Technologies Co., Ltd 7389+ * 7390+ * Licensed under the Apache License, Version 2.0 (the "License"); 7391+ * you may not use this file except in compliance with the License. 7392+ * You may obtain a copy of the License at 7393+ * 7394+ * http://www.apache.org/licenses/LICENSE-2.0 7395+ * 7396+ * Unless required by applicable law or agreed to in writing, software 7397+ * distributed under the License is distributed on an "AS IS" BASIS, 7398+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7399+ * See the License for the specific language governing permissions and 7400+ * limitations under the License. 
7401+ */ 7402+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_ 7403+#define MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_ 7404+ 7405+#include "nnacl/intrinsics/ms_simd_instructions.h" 7406+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 7407+ 7408+#ifdef __cplusplus 7409+extern "C" { 7410+#endif 7411+ 7412+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 7413+#define BLOCK_NUM 4 7414+#define MS_SIMD_NEON 7415+#ifdef MS_SIMD_AVX512 7416+ static inline size_t AdamWeightDecayFp32NEON(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 7417+ const float *gradient, size_t end) { 7418+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 7419+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 7420+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 7421+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 7422+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 7423+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 7424+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 7425+ 7426+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7427+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 7428+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 7429+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 7430+ SIMD_F32 g_r = SIMD_LD_F32(gradient + index); 7431+ 7432+ m_r = SIMD_MUL_F32(m_r, beta1_r); 7433+ v_r = SIMD_MUL_F32(v_r, beta2_r); 7434+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 7435+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 7436+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 7437+ avx_r0 = SIMD_SQRT_F32(v_r); 7438+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 7439+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 7440+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 7441+ SIMD_ST_F32(m + index, m_r); 7442+ SIMD_ST_F32(v + index, v_r); 7443+ SIMD_ST_F32(var + index, var_r); 7444+ } 7445+ 7446+ return index; 7447+} 7448+ 7449+static inline size_t FusedCastAdamFp32Fp16NEON(size_t index, float *var, const int16_t 
*gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 7450+ float global_norm_reciprocal, size_t end) { 7451+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 7452+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 7453+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 7454+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 7455+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 7456+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 7457+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 7458+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 7459+ 7460+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7461+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 7462+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 7463+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 7464+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 7465+ 7466+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 7467+ m_r = SIMD_MUL_F32(m_r, beta1_r); 7468+ v_r = SIMD_MUL_F32(v_r, beta2_r); 7469+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 7470+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 7471+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 7472+ avx_r0 = SIMD_SQRT_F32(v_r); 7473+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 7474+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 7475+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 7476+ SIMD_ST_F32(var + index, var_r); 7477+ SIMD_ST_F32(m + index, m_r); 7478+ SIMD_ST_F32(v + index, v_r); 7479+ } 7480+ 7481+ return index; 7482+} 7483+ 7484+static inline size_t FusedCastAdamFp32Fp32NEON(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 7485+ float global_norm_reciprocal, size_t end) { 7486+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 7487+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 7488+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 7489+ SIMD_F32 beta2_minus_r = 
SIMD_MOV_F32(1.0f - beta2); 7490+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 7491+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 7492+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 7493+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 7494+ 7495+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7496+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 7497+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 7498+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 7499+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 7500+ 7501+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 7502+ m_r = SIMD_MUL_F32(m_r, beta1_r); 7503+ v_r = SIMD_MUL_F32(v_r, beta2_r); 7504+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 7505+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 7506+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 7507+ avx_r0 = SIMD_SQRT_F32(v_r); 7508+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 7509+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 7510+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 7511+ SIMD_ST_F32(var + index, var_r); 7512+ SIMD_ST_F32(m + index, m_r); 7513+ SIMD_ST_F32(v + index, v_r); 7514+ } 7515+ 7516+ return index; 7517+} 7518+ 7519+static inline size_t FusedCastAdamFp16Fp16NEON(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 7520+ float global_norm_reciprocal, size_t end) { 7521+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 7522+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 7523+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 7524+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 7525+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 7526+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 7527+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 7528+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 7529+ 7530+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 
7531+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); 7532+    SIMD_F32 m_r = SIMD_LD_F32(m + index); 7533+    SIMD_F32 v_r = SIMD_LD_F32(v + index); 7534+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 7535+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 7536+    m_r = SIMD_MUL_F32(m_r, beta1_r); 7537+    v_r = SIMD_MUL_F32(v_r, beta2_r); 7538+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 7539+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 7540+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 7541+    avx_r0 = SIMD_SQRT_F32(v_r); 7542+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 7543+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 7544+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 7545+    SIMD_ST_F32(m + index, m_r); 7546+    SIMD_ST_F32(v + index, v_r); 7547+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 7548+  } 7549+ 7550+  return index; 7551+} 7552+ 7553+static inline size_t FusedCastAdamFp16Fp32NEON(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 7554+                        float global_norm_reciprocal, size_t end) { 7555+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 7556+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 7557+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 7558+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 7559+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 7560+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 7561+  SIMD_F32 decay_r = SIMD_MOV_F32(decay); 7562+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 7563+ 7564+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7565+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); 7566+    SIMD_F32 m_r = SIMD_LD_F32(m + index); 7567+    SIMD_F32 v_r = SIMD_LD_F32(v + index); 7568+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 7569+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 7570+    m_r = 
SIMD_MUL_F32(m_r, beta1_r); 7571+ v_r = SIMD_MUL_F32(v_r, beta2_r); 7572+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 7573+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 7574+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 7575+ avx_r0 = SIMD_SQRT_F32(v_r); 7576+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 7577+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 7578+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 7579+ SIMD_ST_F32(m + index, m_r); 7580+ SIMD_ST_F32(v + index, v_r); 7581+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 7582+ } 7583+ 7584+ return index; 7585+} 7586+#endif 7587+ 7588+#undef MS_SIMD_INSTRUCTION 7589+#undef BLOCK_NUM 7590+ 7591+#undef MS_SIMD_NEON 7592+#ifdef __cplusplus 7593+} 7594+#endif 7595+#endif 7596diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h 7597new file mode 100644 7598index 00000000..4ef32418 7599--- /dev/null 7600+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h 7601@@ -0,0 +1,123 @@ 7602+/** 7603+ * Copyright 2022 Huawei Technologies Co., Ltd 7604+ * 7605+ * Licensed under the Apache License, Version 2.0 (the "License"); 7606+ * you may not use this file except in compliance with the License. 7607+ * You may obtain a copy of the License at 7608+ * 7609+ * http://www.apache.org/licenses/LICENSE-2.0 7610+ * 7611+ * Unless required by applicable law or agreed to in writing, software 7612+ * distributed under the License is distributed on an "AS IS" BASIS, 7613+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7614+ * See the License for the specific language governing permissions and 7615+ * limitations under the License. 
7616+ */ 7617+ 7618+#ifndef MINDSPORE_NNACL_FP32_ADD_NEON_H_ 7619+#define MINDSPORE_NNACL_FP32_ADD_NEON_H_ 7620+ 7621+#include "nnacl/intrinsics/ms_simd_instructions.h" 7622+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 7623+ 7624+#ifdef __cplusplus 7625+extern "C" { 7626+#endif 7627+ 7628+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 7629+#define BLOCK_NUM 4 7630+#define MS_SIMD_NEON 7631+ 7632+static inline int ElementOptAddNEON(int index, const float *in0, const float *in1, float *out, int size) { 7633+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 7634+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7635+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 7636+ SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); 7637+ SIMD_ST_F32(out + index, vout); 7638+ } 7639+ return index; 7640+} 7641+ 7642+static inline int ElementOptAddIntNEON(int index, const int *in0, const int *in1, int *out, 7643+ int size) { 7644+ SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); 7645+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7646+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 7647+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); 7648+ SIMD_ST_EPI32(out + index, vout); 7649+ } 7650+ return index; 7651+} 7652+ 7653+static inline int ElementOptAddReluNEON(int index, const float *in0, const float *in1, float *out, 7654+ int size) { 7655+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 7656+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7657+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 7658+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); 7659+ SIMD_ST_F32(out + index, vout); 7660+ } 7661+ return index; 7662+} 7663+ 7664+static inline int ElementOptAddRelu6NEON(int index, const float *in0, const float *in1, float *out, 7665+ int size) { 7666+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 7667+ for (int block_max_size = size - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { 7668+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 7669+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); 7670+ SIMD_ST_F32(out + index, vout); 7671+ } 7672+ return index; 7673+} 7674+ 7675+static inline int ElementAddNEON(int index, const float *in0, const float *in1, float *out, int size) { 7676+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7677+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 7678+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 7679+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 7680+ SIMD_ST_F32(out + index, vout); 7681+ } 7682+ return index; 7683+} 7684+ 7685+static inline int ElementAddReluNEON(int index, const float *in0, const float *in1, float *out, 7686+ int size) { 7687+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7688+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 7689+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 7690+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); 7691+ SIMD_ST_F32(out + index, vout); 7692+ } 7693+ return index; 7694+} 7695+ 7696+static inline int ElementAddRelu6NEON(int index, const float *in0, const float *in1, float *out, 7697+ int size) { 7698+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7699+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 7700+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 7701+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); 7702+ SIMD_ST_F32(out + index, vout); 7703+ } 7704+ return index; 7705+} 7706+ 7707+static inline int ElementAddIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 7708+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7709+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 7710+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 7711+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, 
vin1); 7712+ SIMD_ST_EPI32(out + index, vout); 7713+ } 7714+ return index; 7715+} 7716+ 7717+#undef MS_SIMD_INSTRUCTION 7718+#undef BLOCK_NUM 7719+ 7720+#undef MS_SIMD_NEON 7721+#ifdef __cplusplus 7722+} 7723+#endif 7724+#endif 7725diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h 7726new file mode 100644 7727index 00000000..2449c07d 7728--- /dev/null 7729+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h 7730@@ -0,0 +1,253 @@ 7731+/** 7732+ * Copyright 2022 Huawei Technologies Co., Ltd 7733+ * 7734+ * Licensed under the Apache License, Version 2.0 (the "License"); 7735+ * you may not use this file except in compliance with the License. 7736+ * You may obtain a copy of the License at 7737+ * 7738+ * http://www.apache.org/licenses/LICENSE-2.0 7739+ * 7740+ * Unless required by applicable law or agreed to in writing, software 7741+ * distributed under the License is distributed on an "AS IS" BASIS, 7742+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 7743+ * See the License for the specific language governing permissions and 7744+ * limitations under the License. 
7745+ */ 7746+ 7747+#ifndef MINDSPORE_NNACL_ARITHMETIC_NEON_H_ 7748+#define MINDSPORE_NNACL_ARITHMETIC_NEON_H_ 7749+ 7750+#include "nnacl/intrinsics/ms_simd_instructions.h" 7751+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 7752+ 7753+#ifdef __cplusplus 7754+extern "C" { 7755+#endif 7756+ 7757+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 7758+#define BLOCK_NUM 4 7759+#define MS_SIMD_NEON 7760+ 7761+#ifndef MS_SIMD_NEON 7762+static inline int ElementFloorModNEON(int index, const float *in0, const float *in1, float *out, int size) { 7763+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7764+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7765+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7766+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 7767+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 7768+ SIMD_ST_F32(out + index, out_tmp); 7769+ } 7770+ return index; 7771+} 7772+ 7773+static inline int ElementOptFloorModNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { 7774+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 7775+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7776+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7777+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 7778+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 7779+ SIMD_ST_F32(out + index, out_tmp); 7780+ } 7781+ return index; 7782+} 7783+ 7784+static inline int ElementOptFloorModNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { 7785+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 7786+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7787+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7788+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 7789+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, 
SIMD_MUL_F32(floor_tmp, in1_tmp)); 7790+ SIMD_ST_F32(out + index, out_tmp); 7791+ } 7792+ return index; 7793+} 7794+ 7795+static inline int ElementFloorDivNEON(int index, const float *in0, const float *in1, float *out, int size) { 7796+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7797+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7798+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7799+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 7800+ SIMD_ST_F32(out + index, floor_tmp); 7801+ } 7802+ return index; 7803+} 7804+ 7805+static inline int ElementOptFloorDivNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { 7806+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 7807+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7808+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7809+ SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 7810+ SIMD_ST_F32(out + index, out_tmp); 7811+ } 7812+ return index; 7813+} 7814+ 7815+static inline int ElementOptFloorDivNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { 7816+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 7817+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7818+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7819+ SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 7820+ SIMD_ST_F32(out + index, out_tmp); 7821+ } 7822+ return index; 7823+} 7824+#endif 7825+ 7826+static inline int ElementFloorDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 7827+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7828+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 7829+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 7830+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 7831+ SIMD_ST_EPI32(out + index, out_tmp); 7832+ } 7833+ return 
index; 7834+} 7835+ 7836+static inline int ElementOptFloorDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 7837+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 7838+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7839+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 7840+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 7841+ SIMD_ST_EPI32(out + index, out_tmp); 7842+ } 7843+ return index; 7844+} 7845+ 7846+static inline int ElementOptFloorDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 7847+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 7848+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7849+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 7850+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 7851+ SIMD_ST_EPI32(out + index, out_tmp); 7852+ } 7853+ return index; 7854+} 7855+ 7856+static inline int ElementMaximumNEON(int index, const float *in0, const float *in1, float *out, int size) { 7857+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7858+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7859+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7860+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 7861+ SIMD_ST_F32(out + index, out_tmp); 7862+ } 7863+ return index; 7864+} 7865+ 7866+static inline int ElementOptMaximumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { 7867+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 7868+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7869+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7870+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 7871+ SIMD_ST_F32(out + index, out_tmp); 7872+ } 7873+ return index; 7874+} 7875+ 7876+static inline int ElementOptMaximumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { 7877+ 
SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 7878+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7879+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7880+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 7881+ SIMD_ST_F32(out + index, out_tmp); 7882+ } 7883+ return index; 7884+} 7885+ 7886+static inline int ElementMaximumIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 7887+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7888+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 7889+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 7890+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 7891+ SIMD_ST_EPI32(out + index, out_tmp); 7892+ } 7893+ return index; 7894+} 7895+ 7896+static inline int ElementOptMaximumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 7897+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 7898+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7899+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 7900+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 7901+ SIMD_ST_EPI32(out + index, out_tmp); 7902+ } 7903+ return index; 7904+} 7905+ 7906+static inline int ElementOptMaximumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 7907+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 7908+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7909+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 7910+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 7911+ SIMD_ST_EPI32(out + index, out_tmp); 7912+ } 7913+ return index; 7914+} 7915+ 7916+static inline int ElementMinimumIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 7917+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7918+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 
7919+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 7920+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 7921+ SIMD_ST_EPI32(out + index, out_tmp); 7922+ } 7923+ return index; 7924+} 7925+ 7926+static inline int ElementOptMinimumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 7927+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 7928+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7929+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 7930+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 7931+ SIMD_ST_EPI32(out + index, out_tmp); 7932+ } 7933+ return index; 7934+} 7935+ 7936+static inline int ElementOptMinimumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 7937+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 7938+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7939+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 7940+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 7941+ SIMD_ST_EPI32(out + index, out_tmp); 7942+ } 7943+ return index; 7944+} 7945+ 7946+static inline int ElementMinimumNEON(int index, const float *in0, const float *in1, float *out, int size) { 7947+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7948+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7949+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7950+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 7951+ SIMD_ST_F32(out + index, out_tmp); 7952+ } 7953+ return index; 7954+} 7955+ 7956+static inline int ElementOptMinimumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { 7957+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 7958+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7959+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 7960+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 7961+ SIMD_ST_F32(out + index, 
out_tmp); 7962+ } 7963+ return index; 7964+} 7965+ 7966+static inline int ElementOptMinimumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { 7967+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 7968+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 7969+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 7970+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 7971+ SIMD_ST_F32(out + index, out_tmp); 7972+ } 7973+ return index; 7974+} 7975+ 7976+#undef MS_SIMD_INSTRUCTION 7977+#undef BLOCK_NUM 7978+ 7979+#undef MS_SIMD_NEON 7980+#ifdef __cplusplus 7981+} 7982+#endif 7983+#endif 7984diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h 7985new file mode 100644 7986index 00000000..682148d7 7987--- /dev/null 7988+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h 7989@@ -0,0 +1,128 @@ 7990+/** 7991+ * Copyright 2022 Huawei Technologies Co., Ltd 7992+ * 7993+ * Licensed under the Apache License, Version 2.0 (the "License"); 7994+ * you may not use this file except in compliance with the License. 7995+ * You may obtain a copy of the License at 7996+ * 7997+ * http://www.apache.org/licenses/LICENSE-2.0 7998+ * 7999+ * Unless required by applicable law or agreed to in writing, software 8000+ * distributed under the License is distributed on an "AS IS" BASIS, 8001+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8002+ * See the License for the specific language governing permissions and 8003+ * limitations under the License. 
8004+ */ 8005+ 8006+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_ 8007+#define MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_ 8008+ 8009+#include "nnacl/intrinsics/ms_simd_instructions.h" 8010+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8011+ 8012+#ifdef __cplusplus 8013+extern "C" { 8014+#endif 8015+ 8016+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8017+#define BLOCK_NUM 4 8018+#define MS_SIMD_NEON 8019+ 8020+#if defined(MS_SIMD_AVX512) 8021+// only avx512 support abs fp32 instruction 8022+static inline int ElementAbsNEON(int index, const float *input, float *output, const int element_size) { 8023+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8024+ SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index))); 8025+ } 8026+ return index; 8027+} 8028+ 8029+static inline int ElementAbsIntNEON(int index, const int *input, int *output, const int element_size) { 8030+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8031+ SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index))); 8032+ } 8033+ return index; 8034+} 8035+#endif 8036+ 8037+static inline int ElementSquareNEON(int index, const float *input, float *output, const int element_size) { 8038+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8039+ SIMD_F32 vin = SIMD_LD_F32(input + index); 8040+ SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin)); 8041+ } 8042+ return index; 8043+} 8044+ 8045+static inline int ElementSqrtNEON(int index, const float *input, float *output, const int element_size) { 8046+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8047+ SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index))); 8048+ } 8049+ return index; 8050+} 8051+ 8052+static inline int ElementRsqrtNEON(int index, const float *input, float *output, const int 
element_size) { 8053+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8054+ SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index))); 8055+ } 8056+ return index; 8057+} 8058+ 8059+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE) 8060+// avx512 dont support round fp32 instruction 8061+static inline int ElementRoundNEON(int index, const float *input, float *output, const int element_size) { 8062+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8063+ SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index))); 8064+ } 8065+ return index; 8066+} 8067+#endif 8068+ 8069+#ifndef MS_SIMD_NEON 8070+// neon dont support floor fp32 instruction 8071+static inline int ElementFloorNEON(int index, const float *input, float *output, const int element_size) { 8072+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8073+ SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index))); 8074+ } 8075+ return index; 8076+} 8077+#endif 8078+ 8079+#ifndef MS_SIMD_NEON 8080+static inline int ElementCeilNEON(int index, const float *input, float *output, const int element_size) { 8081+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8082+ SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index))); 8083+ } 8084+ return index; 8085+} 8086+#endif 8087+ 8088+static inline int ElementNegativeNEON(int index, const float *input, float *output, const int element_size) { 8089+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8090+ SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f)); 8091+ } 8092+ return index; 8093+} 8094+ 8095+static inline int ElementNegativeIntNEON(int index, const int *input, int *output, const int element_size) { 8096+ for (int block_max_size = 
element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8097+ SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1)); 8098+ } 8099+ return index; 8100+} 8101+ 8102+static inline int ElementReciprocalNEON(int index, const float *input, float *output, const int element_size) { 8103+ SIMD_F32 num1 = SIMD_MOV_F32(1.0f); 8104+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8105+ SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index))); 8106+ } 8107+ return index; 8108+} 8109+ 8110+#undef MS_SIMD_INSTRUCTION 8111+#undef BLOCK_NUM 8112+ 8113+#undef MS_SIMD_NEON 8114+#ifdef __cplusplus 8115+} 8116+#endif 8117+#endif 8118diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h 8119new file mode 100644 8120index 00000000..5e169d62 8121--- /dev/null 8122+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h 8123@@ -0,0 +1,66 @@ 8124+/** 8125+ * Copyright 2022 Huawei Technologies Co., Ltd 8126+ * 8127+ * Licensed under the Apache License, Version 2.0 (the "License"); 8128+ * you may not use this file except in compliance with the License. 8129+ * You may obtain a copy of the License at 8130+ * 8131+ * http://www.apache.org/licenses/LICENSE-2.0 8132+ * 8133+ * Unless required by applicable law or agreed to in writing, software 8134+ * distributed under the License is distributed on an "AS IS" BASIS, 8135+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8136+ * See the License for the specific language governing permissions and 8137+ * limitations under the License. 
8138+ */ 8139+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_ 8140+#define MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_ 8141+ 8142+#include "nnacl/intrinsics/ms_simd_instructions.h" 8143+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8144+ 8145+#ifdef __cplusplus 8146+extern "C" { 8147+#endif 8148+ 8149+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8150+#define BLOCK_NUM 4 8151+#define MS_SIMD_NEON 8152+ 8153+static inline int BatchNormFp32NEON(int index, const float *input, const float *mean, 8154+                        const float *variance, int channel, float epsilon, float *output) { 8155+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8156+    SIMD_F32 input_data = SIMD_LD_F32(input + index); 8157+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 8158+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 8159+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 8160+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 8161+    SIMD_ST_F32(output + index, output_data); 8162+  } 8163+  return index; 8164+} 8165+ 8166+static inline int FusedBatchNormFp32NEON(int index, const float *input, const float *scale, 8167+                        const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) { 8168+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8169+    SIMD_F32 input_data = SIMD_LD_F32(input + index); 8170+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index); 8171+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index); 8172+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 8173+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 8174+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 8175+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 8176+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_); 8177+    SIMD_ST_F32(output 
+ index, output_data); 8178+ } 8179+ return index; 8180+} 8181+ 8182+#undef MS_SIMD_INSTRUCTION 8183+#undef BLOCK_NUM 8184+ 8185+#undef MS_SIMD_NEON 8186+#ifdef __cplusplus 8187+} 8188+#endif 8189+#endif 8190diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h 8191new file mode 100644 8192index 00000000..3f52857c 8193--- /dev/null 8194+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h 8195@@ -0,0 +1,68 @@ 8196+/** 8197+ * Copyright 2022 Huawei Technologies Co., Ltd 8198+ * 8199+ * Licensed under the Apache License, Version 2.0 (the "License"); 8200+ * you may not use this file except in compliance with the License. 8201+ * You may obtain a copy of the License at 8202+ * 8203+ * http://www.apache.org/licenses/LICENSE-2.0 8204+ * 8205+ * Unless required by applicable law or agreed to in writing, software 8206+ * distributed under the License is distributed on an "AS IS" BASIS, 8207+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8208+ * See the License for the specific language governing permissions and 8209+ * limitations under the License. 
8210+ */ 8211+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_ 8212+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_ 8213+ 8214+#include "nnacl/intrinsics/ms_simd_instructions.h" 8215+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8216+ 8217+#ifdef __cplusplus 8218+extern "C" { 8219+#endif 8220+ 8221+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8222+#define BLOCK_NUM 4 8223+#define MS_SIMD_NEON 8224+ 8225+static inline int BCEWithLogitLossNEON(int index, const float *logits, const float *label, 8226+ const float *weight, const float *pos_weight, int length, bool reduction, float *output, 8227+ float *reduction_sum) { 8228+ SIMD_F32 zero = SIMD_SET0_F32; 8229+ SIMD_F32 ones = SIMD_MOV_F32(1.0f); 8230+ SIMD_F32 middle_output = SIMD_SET0_F32; 8231+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8232+ SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); 8233+ SIMD_F32 label_tmp = SIMD_LD_F32(label + index); 8234+ SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); 8235+ SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); 8236+ SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); 8237+ SIMD_F32 max_value = neg_logits_tmp; 8238+ max_value = SIMD_MIN_F32(max_value, zero); 8239+ SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); 8240+ SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); 8241+ SIMD_F32 log_exp_value = 8242+ SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); 8243+ SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), 8244+ SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); 8245+ if (reduction) { 8246+ middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); 8247+ } else { 8248+ SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp)); 8249+ } 8250+ } 8251+ if (reduction) { 8252+ *reduction_sum 
+= SIMD_GET_SUM_F32(middle_output); 8253+ } 8254+ return index; 8255+} 8256+#undef MS_SIMD_INSTRUCTION 8257+#undef BLOCK_NUM 8258+ 8259+#undef MS_SIMD_NEON 8260+#ifdef __cplusplus 8261+} 8262+#endif 8263+#endif 8264diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h 8265new file mode 100644 8266index 00000000..afaf0de5 8267--- /dev/null 8268+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h 8269@@ -0,0 +1,63 @@ 8270+/** 8271+ * Copyright 2022 Huawei Technologies Co., Ltd 8272+ * 8273+ * Licensed under the Apache License, Version 2.0 (the "License"); 8274+ * you may not use this file except in compliance with the License. 8275+ * You may obtain a copy of the License at 8276+ * 8277+ * http://www.apache.org/licenses/LICENSE-2.0 8278+ * 8279+ * Unless required by applicable law or agreed to in writing, software 8280+ * distributed under the License is distributed on an "AS IS" BASIS, 8281+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8282+ * See the License for the specific language governing permissions and 8283+ * limitations under the License. 
8284+ */ 8285+ 8286+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_ 8287+#define MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_ 8288+ 8289+#include "nnacl/intrinsics/ms_simd_instructions.h" 8290+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8291+ 8292+#ifdef __cplusplus 8293+extern "C" { 8294+#endif 8295+ 8296+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8297+#define BLOCK_NUM 4 8298+#define MS_SIMD_NEON 8299+ 8300+static inline int BiasAddByInnerCoreNEON(int index, const float *input, const float *bias, float *output, 8301+ int64_t num) { 8302+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8303+ SIMD_F32 vin0 = SIMD_LD_F32(input + index); 8304+ SIMD_F32 vin1 = SIMD_LD_F32(bias + index); 8305+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 8306+ SIMD_ST_F32(output + index, vout); 8307+ } 8308+ return index; 8309+} 8310+ 8311+static inline int BiasAddByBatchCoreNEON(int index, const float *input, const float *bias, float *output1, 8312+ float *output2, float *output3, float *output4, int64_t num) { 8313+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8314+ SIMD_LDX4_F32(input_data, input + index, num); 8315+ SIMD_F32 bias_data = SIMD_LD_F32(bias + index); 8316+ SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data)); 8317+ SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data)); 8318+ SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data)); 8319+ SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data)); 8320+ } 8321+ return index; 8322+} 8323+ 8324+#undef MS_SIMD_INSTRUCTION 8325+#undef BLOCK_NUM 8326+ 8327+#undef MS_SIMD_NEON 8328+#ifdef __cplusplus 8329+} 8330+#endif 8331+ 8332+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_ 8333diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h 8334new file mode
100644 8335index 00000000..8fe26687 8336--- /dev/null 8337+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h 8338@@ -0,0 +1,55 @@ 8339+/** 8340+ * Copyright 2022 Huawei Technologies Co., Ltd 8341+ * 8342+ * Licensed under the Apache License, Version 2.0 (the "License"); 8343+ * you may not use this file except in compliance with the License. 8344+ * You may obtain a copy of the License at 8345+ * 8346+ * http://www.apache.org/licenses/LICENSE-2.0 8347+ * 8348+ * Unless required by applicable law or agreed to in writing, software 8349+ * distributed under the License is distributed on an "AS IS" BASIS, 8350+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8351+ * See the License for the specific language governing permissions and 8352+ * limitations under the License. 8353+ */ 8354+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_ 8355+#define MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_ 8356+ 8357+#include "nnacl/intrinsics/ms_simd_instructions.h" 8358+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8359+ 8360+#ifdef __cplusplus 8361+extern "C" { 8362+#endif 8363+ 8364+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8365+#define BLOCK_NUM 4 8366+#define MS_SIMD_NEON 8367+ 8368+static inline int Int32ToFloat32NEON(int index, const int32_t *input, float *output, int number) { 8369+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8370+ SIMD_EPI32 value = SIMD_LD_EPI32(input + index); 8371+ SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); 8372+ } 8373+ return index; 8374+} 8375+ 8376+#ifndef MS_SIMD_NEON 8377+static inline int Float32ToInt32NEON(int index, const float *input, int32_t *output, int number) { 8378+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8379+ SIMD_F32 value = SIMD_LD_F32(input + index); 8380+ SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); 8381+ } 8382+ return index; 
8383+} 8384+#endif 8385+ 8386+#undef MS_SIMD_INSTRUCTION 8387+#undef BLOCK_NUM 8388+ 8389+#undef MS_SIMD_NEON 8390+#ifdef __cplusplus 8391+} 8392+#endif 8393+#endif 8394diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h 8395new file mode 100644 8396index 00000000..09f55bbf 8397--- /dev/null 8398+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h 8399@@ -0,0 +1,69 @@ 8400+/** 8401+ * Copyright 2022 Huawei Technologies Co., Ltd 8402+ * 8403+ * Licensed under the Apache License, Version 2.0 (the "License"); 8404+ * you may not use this file except in compliance with the License. 8405+ * You may obtain a copy of the License at 8406+ * 8407+ * http://www.apache.org/licenses/LICENSE-2.0 8408+ * 8409+ * Unless required by applicable law or agreed to in writing, software 8410+ * distributed under the License is distributed on an "AS IS" BASIS, 8411+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8412+ * See the License for the specific language governing permissions and 8413+ * limitations under the License. 
8414+ */ 8415+#ifndef MINDSPORE_NNACL_FP32_CDIST_NEON_H_ 8416+#define MINDSPORE_NNACL_FP32_CDIST_NEON_H_ 8417+ 8418+#include "nnacl/intrinsics/ms_simd_instructions.h" 8419+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8420+ 8421+#ifdef __cplusplus 8422+extern "C" { 8423+#endif 8424+ 8425+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8426+#define BLOCK_NUM 4 8427+#define MS_SIMD_NEON 8428+ 8429+static inline int64_t CdistTwoNormalOptNEON(int64_t index, const float *a, const float *b, 8430+ float *out, int64_t size) { 8431+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 8432+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8433+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 8434+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 8435+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 8436+ tmp_vec = SIMD_ABS_F32(tmp_vec); 8437+ result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); 8438+ } 8439+ *out += SIMD_GET_SUM_F32(result_vec); 8440+ 8441+ return index; 8442+} 8443+ 8444+static inline int64_t CdistPNormalOptNEON(int64_t index, const float *a, const float *b, 8445+ float *out, int64_t size, float p) { 8446+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 8447+ SIMD_F32 p_vec = SIMD_MOV_F32(p); 8448+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8449+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 8450+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 8451+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 8452+ tmp_vec = SIMD_ABS_F32(tmp_vec); 8453+ tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); 8454+ result_vec = SIMD_ADD_F32(tmp_vec, result_vec); 8455+ } 8456+ *out += SIMD_GET_SUM_F32(result_vec); 8457+ 8458+ return index; 8459+} 8460+ 8461+#undef MS_SIMD_INSTRUCTION 8462+#undef BLOCK_NUM 8463+ 8464+#undef MS_SIMD_NEON 8465+#ifdef __cplusplus 8466+} 8467+#endif 8468+#endif 8469diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h 
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h 8470new file mode 100644 8471index 00000000..d8a2580a 8472--- /dev/null 8473+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h 8474@@ -0,0 +1,120 @@ 8475+/** 8476+ * Copyright 2022 Huawei Technologies Co., Ltd 8477+ * 8478+ * Licensed under the Apache License, Version 2.0 (the "License"); 8479+ * you may not use this file except in compliance with the License. 8480+ * You may obtain a copy of the License at 8481+ * 8482+ * http://www.apache.org/licenses/LICENSE-2.0 8483+ * 8484+ * Unless required by applicable law or agreed to in writing, software 8485+ * distributed under the License is distributed on an "AS IS" BASIS, 8486+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8487+ * See the License for the specific language governing permissions and 8488+ * limitations under the License. 8489+ */ 8490+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_ 8491+#define MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_ 8492+ 8493+#include "nnacl/intrinsics/ms_simd_instructions.h" 8494+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8495+ 8496+#ifdef __cplusplus 8497+extern "C" { 8498+#endif 8499+ 8500+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8501+#define BLOCK_NUM 4 8502+#define MS_SIMD_NEON 8503+ 8504+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 8505+// (a, b, c) -> (0, a, a+b) exclusive == true 8506+static inline int64_t CumsumOutputInitWithInputNEON(int64_t index, const float *layer_input, 8507+ float *layer_output, int inner_dim) { 8508+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8509+ SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index)); 8510+ } 8511+ return index; 8512+} 8513+ 8514+static inline int64_t CumsumOutputInitWithZeroNEON(int64_t index, float *layer_output, int inner_dim) { 8515+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; 
index < block_max_size; index += BLOCK_NUM) { 8516+ SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f)); 8517+ } 8518+ return index; 8519+} 8520+ 8521+static inline int64_t CumsumNEON(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output, 8522+ int inner_dim) { 8523+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8524+ SIMD_F32 input_val = SIMD_LD_F32(layer_input + index); 8525+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index); 8526+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 8527+ SIMD_ST_F32(layer_output + index, out_val); 8528+ } 8529+ return index; 8530+} 8531+ 8532+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 8533+// (a, b, c) -> (c+b, c, 0) exclusive==true 8534+static inline int64_t CumsumReverseNEON(int64_t index, const float *layer_input, float *layer_output, 8535+ float *layer_last_output, int inner_dim) { 8536+ 8537+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8538+ SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1); 8539+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1); 8540+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 8541+ SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val); 8542+ } 8543+ return index; 8544+} 8545+ 8546+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 8547+// (a, b, c) -> (0, a, a+b) exclusive == true 8548+static inline int64_t CumsumIntOutputInitWithInputNEON(int64_t index, const int *layer_input, 8549+ int *layer_output, int inner_dim) { 8550+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8551+ SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index)); 8552+ } 8553+ return index; 8554+} 8555+ 8556+static inline int64_t CumsumIntOutputInitWithZeroNEON(int64_t index, int *layer_output, int inner_dim) { 8557+ for 
(int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8558+ SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0)); 8559+ } 8560+ return index; 8561+} 8562+ 8563+static inline int64_t CumsumIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 8564+ int inner_dim) { 8565+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8566+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index); 8567+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index); 8568+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 8569+ SIMD_ST_EPI32(layer_output + index, out_val); 8570+ } 8571+ return index; 8572+} 8573+ 8574+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 8575+// (a, b, c) -> (c+b, c, 0) exclusive==true 8576+static inline int64_t CumsumReverseIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 8577+ int inner_dim) { 8578+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8579+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); 8580+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); 8581+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 8582+ SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); 8583+ } 8584+ return index; 8585+} 8586+ 8587+#undef MS_SIMD_INSTRUCTION 8588+#undef BLOCK_NUM 8589+ 8590+#undef MS_SIMD_NEON 8591+#ifdef __cplusplus 8592+} 8593+#endif 8594+#endif 8595diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h 8596new file mode 100644 8597index 00000000..c4ce6594 8598--- /dev/null 8599+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h 8600@@ -0,0 +1,166 @@ 8601+/** 8602+ *
Copyright 2022 Huawei Technologies Co., Ltd 8603+ * 8604+ * Licensed under the Apache License, Version 2.0 (the "License"); 8605+ * you may not use this file except in compliance with the License. 8606+ * You may obtain a copy of the License at 8607+ * 8608+ * http://www.apache.org/licenses/LICENSE-2.0 8609+ * 8610+ * Unless required by applicable law or agreed to in writing, software 8611+ * distributed under the License is distributed on an "AS IS" BASIS, 8612+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8613+ * See the License for the specific language governing permissions and 8614+ * limitations under the License. 8615+ */ 8616+ 8617+#ifndef MINDSPORE_NNACL_FP32_DIV_NEON_H_ 8618+#define MINDSPORE_NNACL_FP32_DIV_NEON_H_ 8619+ 8620+#include "nnacl/intrinsics/ms_simd_instructions.h" 8621+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8622+ 8623+#ifdef __cplusplus 8624+extern "C" { 8625+#endif 8626+ 8627+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8628+#define BLOCK_NUM 4 8629+#define MS_SIMD_NEON 8630+ 8631+static inline int ElementOptDivNum0NEON(int index, const float *in0, const float *in1, float *out, 8632+ int size) { 8633+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 8634+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8635+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 8636+ SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); 8637+ SIMD_ST_F32(out + index, vout); 8638+ } 8639+ return index; 8640+} 8641+ 8642+static inline int ElementOptDivNum1NEON(int index, const float *in0, const float *in1, float *out, 8643+ int size) { 8644+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 8645+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8646+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 8647+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); 8648+ SIMD_ST_F32(out + index, vout); 8649+ } 8650+ return index; 8651+} 8652+ 8653+static inline int 
ElementOptDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 8654+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 8655+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8656+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 8657+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); 8658+ SIMD_ST_EPI32(out + index, vout); 8659+ } 8660+ return index; 8661+} 8662+ 8663+static inline int ElementOptDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 8664+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 8665+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8666+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 8667+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); 8668+ SIMD_ST_EPI32(out + index, vout); 8669+ } 8670+ return index; 8671+} 8672+ 8673+static inline int ElementOptDivReluNum0NEON(int index, const float *in0, const float *in1, float *out, 8674+ int size) { 8675+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 8676+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8677+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 8678+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); 8679+ SIMD_ST_F32(out + index, vout); 8680+ } 8681+ return index; 8682+} 8683+ 8684+static inline int ElementOptDivReluNum1NEON(int index, const float *in0, const float *in1, float *out, 8685+ int size) { 8686+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 8687+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8688+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 8689+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); 8690+ SIMD_ST_F32(out + index, vout); 8691+ } 8692+ return index; 8693+} 8694+ 8695+static inline int ElementOptDivRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, 8696+ int size) { 8697+ SIMD_F32 vin0_opt = 
SIMD_MOV_F32(in0[0]); 8698+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8699+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 8700+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); 8701+ SIMD_ST_F32(out + index, vout); 8702+ } 8703+ return index; 8704+} 8705+ 8706+static inline int ElementOptDivRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, 8707+ int size) { 8708+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 8709+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8710+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 8711+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); 8712+ SIMD_ST_F32(out + index, vout); 8713+ } 8714+ return index; 8715+} 8716+ 8717+static inline int ElementDivNEON(int index, const float *in0, const float *in1, float *out, int size) { 8718+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8719+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 8720+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 8721+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); 8722+ SIMD_ST_F32(out + index, vout); 8723+ } 8724+ return index; 8725+} 8726+ 8727+static inline int ElementDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 8728+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8729+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 8730+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 8731+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); 8732+ SIMD_ST_EPI32(out + index, vout); 8733+ } 8734+ return index; 8735+} 8736+ 8737+static inline int ElementDivReluNEON(int index, const float *in0, const float *in1, float *out, 8738+ int size) { 8739+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8740+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 8741+ 
SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 8742+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); 8743+ SIMD_ST_F32(out + index, vout); 8744+ } 8745+ return index; 8746+} 8747+ 8748+static inline int ElementDivRelu6NEON(int index, const float *in0, const float *in1, float *out, 8749+ int size) { 8750+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8751+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 8752+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 8753+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f); 8754+ SIMD_ST_F32(out + index, vout); 8755+ } 8756+ return index; 8757+} 8758+ 8759+#undef MS_SIMD_INSTRUCTION 8760+#undef BLOCK_NUM 8761+ 8762+#undef MS_SIMD_NEON 8763+#ifdef __cplusplus 8764+}; 8765+#endif 8766+#endif 8767diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h 8768new file mode 100644 8769index 00000000..b71db336 8770--- /dev/null 8771+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h 8772@@ -0,0 +1,45 @@ 8773+/** 8774+ * Copyright 2022 Huawei Technologies Co., Ltd 8775+ * 8776+ * Licensed under the Apache License, Version 2.0 (the "License"); 8777+ * you may not use this file except in compliance with the License. 8778+ * You may obtain a copy of the License at 8779+ * 8780+ * http://www.apache.org/licenses/LICENSE-2.0 8781+ * 8782+ * Unless required by applicable law or agreed to in writing, software 8783+ * distributed under the License is distributed on an "AS IS" BASIS, 8784+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8785+ * See the License for the specific language governing permissions and 8786+ * limitations under the License. 
8787+ */ 8788+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_ 8789+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_ 8790+ 8791+#include "nnacl/intrinsics/ms_simd_instructions.h" 8792+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8793+ 8794+#ifdef __cplusplus 8795+extern "C" { 8796+#endif 8797+ 8798+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8799+#define BLOCK_NUM 4 8800+#define MS_SIMD_NEON 8801+ 8802+static inline int DropoutFp32NEON(int index, const float *input, float scale, 8803+ int length, float *output) { 8804+ SIMD_F32 scale_value = SIMD_MOV_F32(scale); 8805+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8806+ SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); 8807+ } 8808+ return index; 8809+} 8810+#undef MS_SIMD_INSTRUCTION 8811+#undef BLOCK_NUM 8812+ 8813+#undef MS_SIMD_NEON 8814+#ifdef __cplusplus 8815+} 8816+#endif 8817+#endif 8818diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h 8819new file mode 100644 8820index 00000000..a594abd2 8821--- /dev/null 8822+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h 8823@@ -0,0 +1,62 @@ 8824+/** 8825+ * Copyright 2022 Huawei Technologies Co., Ltd 8826+ * 8827+ * Licensed under the Apache License, Version 2.0 (the "License"); 8828+ * you may not use this file except in compliance with the License. 8829+ * You may obtain a copy of the License at 8830+ * 8831+ * http://www.apache.org/licenses/LICENSE-2.0 8832+ * 8833+ * Unless required by applicable law or agreed to in writing, software 8834+ * distributed under the License is distributed on an "AS IS" BASIS, 8835+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8836+ * See the License for the specific language governing permissions and 8837+ * limitations under the License. 
8838+ */ 8839+ 8840+#ifndef MINDSPORE_NNACL_FP32_EXP_NEON_H_ 8841+#define MINDSPORE_NNACL_FP32_EXP_NEON_H_ 8842+ 8843+#include "nnacl/intrinsics/ms_simd_instructions.h" 8844+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8845+ 8846+#ifdef __cplusplus 8847+extern "C" { 8848+#endif 8849+ 8850+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8851+#define BLOCK_NUM 4 8852+#define MS_SIMD_NEON 8853+ 8854+static inline int64_t ExpFp32NEON(int64_t index, const float *src, float *dst, int num) { 8855+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8856+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 8857+ } 8858+ return index; 8859+} 8860+ 8861+static inline int64_t ExpFp32WithInScaleNEON(int64_t index, const float *src, float *dst, int num, float in_scale) { 8862+ SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale); 8863+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8864+ SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index); 8865+ } 8866+ return index; 8867+} 8868+ 8869+static inline int64_t ExpFp32WithOutScaleNEON(int64_t index, const float *src, float *dst, int num, float out_scale) { 8870+ SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale); 8871+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8872+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 8873+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec)); 8874+ } 8875+ return index; 8876+} 8877+ 8878+#undef MS_SIMD_INSTRUCTION 8879+#undef BLOCK_NUM 8880+ 8881+#undef MS_SIMD_NEON 8882+#ifdef __cplusplus 8883+} 8884+#endif 8885+#endif 8886diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h 8887new file mode 100644 8888index 00000000..c467d2d9 8889--- /dev/null 8890+++
b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h 8891@@ -0,0 +1,52 @@ 8892+/** 8893+ * Copyright 2022 Huawei Technologies Co., Ltd 8894+ * 8895+ * Licensed under the Apache License, Version 2.0 (the "License"); 8896+ * you may not use this file except in compliance with the License. 8897+ * You may obtain a copy of the License at 8898+ * 8899+ * http://www.apache.org/licenses/LICENSE-2.0 8900+ * 8901+ * Unless required by applicable law or agreed to in writing, software 8902+ * distributed under the License is distributed on an "AS IS" BASIS, 8903+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8904+ * See the License for the specific language governing permissions and 8905+ * limitations under the License. 8906+ */ 8907+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_ 8908+#define MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_ 8909+ 8910+#include "nnacl/intrinsics/ms_simd_instructions.h" 8911+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8912+ 8913+#ifdef __cplusplus 8914+extern "C" { 8915+#endif 8916+ 8917+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8918+#define BLOCK_NUM 4 8919+#define MS_SIMD_NEON 8920+ 8921+static inline int FillFp32NEON(int index, float *output, int size, float data) { 8922+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8923+ SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); 8924+ } 8925+ return index; 8926+} 8927+ 8928+static inline int FillInt32NEON(int index, int *output, int size, int data) { 8929+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8930+ SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); 8931+ } 8932+ return index; 8933+} 8934+ 8935+#undef MS_SIMD_INSTRUCTION 8936+#undef BLOCK_NUM 8937+ 8938+#undef MS_SIMD_NEON 8939+#ifdef __cplusplus 8940+} 8941+#endif 8942+#endif 8943+ 8944diff --git 
a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h 8945new file mode 100644 8946index 00000000..0eb6c9d2 8947--- /dev/null 8948+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h 8949@@ -0,0 +1,76 @@ 8950+/** 8951+ * Copyright 2022 Huawei Technologies Co., Ltd 8952+ * 8953+ * Licensed under the Apache License, Version 2.0 (the "License"); 8954+ * you may not use this file except in compliance with the License. 8955+ * You may obtain a copy of the License at 8956+ * 8957+ * http://www.apache.org/licenses/LICENSE-2.0 8958+ * 8959+ * Unless required by applicable law or agreed to in writing, software 8960+ * distributed under the License is distributed on an "AS IS" BASIS, 8961+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 8962+ * See the License for the specific language governing permissions and 8963+ * limitations under the License. 
8964+ */ 8965+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_ 8966+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_ 8967+ 8968+#include "nnacl/intrinsics/ms_simd_instructions.h" 8969+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 8970+ 8971+#ifdef __cplusplus 8972+extern "C" { 8973+#endif 8974+ 8975+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 8976+#define BLOCK_NUM 4 8977+#define MS_SIMD_NEON 8978+ 8979+static inline int64_t GroupNormFp32NEON(int64_t index, const float *unit_input, float scale, float offset, float mean, 8980+ float var_sqrt, int unit, float *unit_output) { 8981+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 8982+ SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); 8983+ SIMD_F32 scale_val = SIMD_MOV_F32(scale); 8984+ SIMD_F32 offset_val = SIMD_MOV_F32(offset); 8985+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8986+ SIMD_F32 input = SIMD_LD_F32(unit_input + index); 8987+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); 8988+ SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); 8989+ SIMD_ST_F32(unit_output + index, output); 8990+ } 8991+ return index; 8992+} 8993+ 8994+static inline int64_t GroupNormReduceSumNEON(int64_t index, const float *in, float *sum, int unit) { 8995+ if (unit - index >= 4 * BLOCK_NUM) { 8996+ SIMD_F32 tmp = SIMD_MOV_F32(0); 8997+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 8998+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); 8999+ } 9000+ *sum += SIMD_GET_SUM_F32(tmp); 9001+ } 9002+ return index; 9003+} 9004+ 9005+static inline int64_t GroupNormReduceVarNEON(int64_t index, const float *in, float mean, float *sum, int unit) { 9006+ if (unit - index >= 4 * BLOCK_NUM) { 9007+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 9008+ SIMD_F32 tmp = SIMD_MOV_F32(0); 9009+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9010+ SIMD_F32 
input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); 9011+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); 9012+ } 9013+ *sum += SIMD_GET_SUM_F32(tmp); 9014+ } 9015+ return index; 9016+} 9017+ 9018+#undef MS_SIMD_INSTRUCTION 9019+#undef BLOCK_NUM 9020+ 9021+#undef MS_SIMD_NEON 9022+#ifdef __cplusplus 9023+} 9024+#endif 9025+#endif 9026diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h 9027new file mode 100644 9028index 00000000..0c528616 9029--- /dev/null 9030+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h 9031@@ -0,0 +1,67 @@ 9032+/** 9033+ * Copyright 2022 Huawei Technologies Co., Ltd 9034+ * 9035+ * Licensed under the Apache License, Version 2.0 (the "License"); 9036+ * you may not use this file except in compliance with the License. 9037+ * You may obtain a copy of the License at 9038+ * 9039+ * http://www.apache.org/licenses/LICENSE-2.0 9040+ * 9041+ * Unless required by applicable law or agreed to in writing, software 9042+ * distributed under the License is distributed on an "AS IS" BASIS, 9043+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9044+ * See the License for the specific language governing permissions and 9045+ * limitations under the License. 
9046+ */ 9047+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_ 9048+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_ 9049+ 9050+#include "nnacl/intrinsics/ms_simd_instructions.h" 9051+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9052+ 9053+#ifdef __cplusplus 9054+extern "C" { 9055+#endif 9056+ 9057+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9058+#define BLOCK_NUM 4 9059+#define MS_SIMD_NEON 9060+ 9061+static inline int LayerNormMeanAndSquareNEON(int index, const float *src, int num, float *mean, float *square_mean) { 9062+ if (num >= 4 * BLOCK_NUM) { 9063+ SIMD_F32 sum_val = SIMD_SET0_F32; 9064+ SIMD_F32 square_sum_val = SIMD_SET0_F32; 9065+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9066+ SIMD_F32 value = SIMD_LD_F32(src + index); 9067+ SIMD_F32 square_value = SIMD_MUL_F32(value, value); 9068+ sum_val = SIMD_ADD_F32(sum_val, value); 9069+ square_sum_val = SIMD_ADD_F32(square_sum_val, square_value); 9070+ } 9071+ *mean += SIMD_GET_SUM_F32(sum_val); 9072+ *square_mean += SIMD_GET_SUM_F32(square_sum_val); 9073+ } 9074+ return index; 9075+} 9076+ 9077+static inline int LayerNormGammaAndBetaNEON(int index, float *dst, const float *src, const float *gamma_data, 9078+ const float *beta_data, int num, const float mean, const float deno) { 9079+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 9080+ SIMD_F32 deno_val = SIMD_MOV_F32(deno); 9081+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9082+ SIMD_F32 value = SIMD_LD_F32(src + index); 9083+ SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val); 9084+ out_value = SIMD_MUL_F32(out_value, deno_val); 9085+ out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index)); 9086+ SIMD_ST_F32(dst + index, out_value); 9087+ } 9088+ return index; 9089+} 9090+ 9091+#undef MS_SIMD_INSTRUCTION 9092+#undef BLOCK_NUM 9093+ 9094+#undef MS_SIMD_NEON 9095+#ifdef __cplusplus 9096+} 
9097+#endif 9098+#endif 9099diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h 9100new file mode 100644 9101index 00000000..0e12e5a0 9102--- /dev/null 9103+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h 9104@@ -0,0 +1,92 @@ 9105+/** 9106+ * Copyright 2022 Huawei Technologies Co., Ltd 9107+ * 9108+ * Licensed under the Apache License, Version 2.0 (the "License"); 9109+ * you may not use this file except in compliance with the License. 9110+ * You may obtain a copy of the License at 9111+ * 9112+ * http://www.apache.org/licenses/LICENSE-2.0 9113+ * 9114+ * Unless required by applicable law or agreed to in writing, software 9115+ * distributed under the License is distributed on an "AS IS" BASIS, 9116+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9117+ * See the License for the specific language governing permissions and 9118+ * limitations under the License. 9119+ */ 9120+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_ 9121+#define MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_ 9122+ 9123+#include "nnacl/intrinsics/ms_simd_instructions.h" 9124+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9125+ 9126+#ifdef __cplusplus 9127+extern "C" { 9128+#endif 9129+ 9130+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9131+#define BLOCK_NUM 4 9132+#define MS_SIMD_NEON 9133+ 9134+// act_type must be 0, 1, or 3. 0: no_act, 1: relu, 3: relu6.
9135+static inline int64_t GemmIsNotPackNEON(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, 9136+ int deep, int act_type) { 9137+ SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); 9138+ SIMD_F32 up_threshold = SIMD_MOV_F32(6); 9139+ SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); 9140+ SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); 9141+ for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9142+ SIMD_F32 a_data = SIMD_LD_F32(a + index); 9143+ SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); 9144+ if (act_type != 0) { 9145+ dst = SIMD_MAX_F32(dst, down_threshold); 9146+ if (act_type == 3) { 9147+ dst = SIMD_MIN_F32(dst, up_threshold); 9148+ } 9149+ } 9150+ SIMD_ST_F32(c + index, dst); 9151+ } 9152+ 9153+ return index; 9154+} 9155+ 9156+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) 9157+static inline int64_t GemmIsNotPackOptimizeCoreNEON(int64_t index, const float *a, const float *b, int k, float *dst) { 9158+ SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); 9159+ for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9160+ SIMD_F32 weight = SIMD_LD_F32(b + index); 9161+ SIMD_F32 a1 = SIMD_LD_F32(a + index); 9162+ dst1 = SIMD_FMADD_F32(weight, a1, dst1); 9163+ } 9164+ *dst += SIMD_REDUCE_ADD_F32(dst1); 9165+ return index; 9166+} 9167+#endif 9168+ 9169+static inline int64_t MatVecMulNoPackCoreNEON(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, 9170+ int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { 9171+ for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { 9172+ SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? 
SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); 9173+ for (int64_t k = 0; k < depth; ++k) { 9174+ SIMD_F32 left = SIMD_MOV_F32(a[k]); 9175+ SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); 9176+ out = SIMD_FMADD_F32(left, right, out); 9177+ } 9178+ if ((inc_flag & 0x2) != 0 && act_type != 0) { 9179+ out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); 9180+ if (act_type == 0x3) { 9181+ out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); 9182+ } 9183+ } 9184+ SIMD_ST_F32(c + oc_index, out); 9185+ } 9186+ return oc_index; 9187+} 9188+ 9189+#undef MS_SIMD_INSTRUCTION 9190+#undef BLOCK_NUM 9191+ 9192+#undef MS_SIMD_NEON 9193+#ifdef __cplusplus 9194+} 9195+#endif 9196+#endif 9197diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h 9198new file mode 100644 9199index 00000000..33506e0c 9200--- /dev/null 9201+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h 9202@@ -0,0 +1,217 @@ 9203+/** 9204+ * Copyright 2022 Huawei Technologies Co., Ltd 9205+ * 9206+ * Licensed under the Apache License, Version 2.0 (the "License"); 9207+ * you may not use this file except in compliance with the License. 9208+ * You may obtain a copy of the License at 9209+ * 9210+ * http://www.apache.org/licenses/LICENSE-2.0 9211+ * 9212+ * Unless required by applicable law or agreed to in writing, software 9213+ * distributed under the License is distributed on an "AS IS" BASIS, 9214+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9215+ * See the License for the specific language governing permissions and 9216+ * limitations under the License. 
9217+ */ 9218+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_ 9219+#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_ 9220+ 9221+#include "nnacl/intrinsics/ms_simd_instructions.h" 9222+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9223+ 9224+#ifdef __cplusplus 9225+extern "C" { 9226+#endif 9227+ 9228+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9229+#define BLOCK_NUM 4 9230+#define MS_SIMD_NEON 9231+ 9232+static inline int ElementMulNEON(int index, const float *in0, const float *in1, float *out, int size) { 9233+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9234+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9235+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9236+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1); 9237+ SIMD_ST_F32(out + index, vout); 9238+ } 9239+ return index; 9240+} 9241+ 9242+static inline int ElementMulReluNEON(int index, const float *in0, const float *in1, float *out, int size) { 9243+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9244+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9245+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9246+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f); 9247+ SIMD_ST_F32(out + index, vout); 9248+ } 9249+ return index; 9250+} 9251+ 9252+static inline int ElementMulRelu6NEON(int index, const float *in0, const float *in1, float *out, int size) { 9253+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9254+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9255+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9256+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f); 9257+ SIMD_ST_F32(out + index, vout); 9258+ } 9259+ return index; 9260+} 9261+ 9262+static inline int ElementMulIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 9263+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 
9264+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9265+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9266+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1); 9267+ SIMD_ST_EPI32(out + index, vout); 9268+ } 9269+ return index; 9270+} 9271+ 9272+static inline int ElementMulReluIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 9273+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9274+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9275+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9276+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f); 9277+ SIMD_ST_EPI32(out + index, vout); 9278+ } 9279+ return index; 9280+} 9281+ 9282+static inline int ElementMulRelu6IntNEON(int index, const int *in0, const int *in1, int *out, int size) { 9283+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9284+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9285+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9286+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f); 9287+ SIMD_ST_EPI32(out + index, vout); 9288+ } 9289+ return index; 9290+} 9291+ 9292+static inline int ElementOptMulNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { 9293+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 9294+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9295+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9296+ SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1); 9297+ SIMD_ST_F32(out + index, vout); 9298+ } 9299+ return index; 9300+} 9301+ 9302+static inline int ElementOptMulNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { 9303+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 9304+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9305+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9306+ SIMD_F32 vout = 
SIMD_MUL_F32(vin0, vin1_opt_); 9307+ SIMD_ST_F32(out + index, vout); 9308+ } 9309+ return index; 9310+} 9311+ 9312+static inline int ElementOptMulReluNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { 9313+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 9314+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9315+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9316+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f); 9317+ SIMD_ST_F32(out + index, vout); 9318+ } 9319+ return index; 9320+} 9321+ 9322+static inline int ElementOptMulReluNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { 9323+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 9324+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9325+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9326+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f); 9327+ SIMD_ST_F32(out + index, vout); 9328+ } 9329+ return index; 9330+} 9331+ 9332+static inline int ElementOptMulRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, int size) { 9333+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 9334+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9335+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9336+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f); 9337+ SIMD_ST_F32(out + index, vout); 9338+ } 9339+ return index; 9340+} 9341+ 9342+static inline int ElementOptMulRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, int size) { 9343+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 9344+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9345+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9346+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f); 9347+ SIMD_ST_F32(out + index, vout); 
9348+ } 9349+ return index; 9350+} 9351+ 9352+static inline int ElementOptMulIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 9353+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 9354+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9355+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9356+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1); 9357+ SIMD_ST_EPI32(out + index, vout); 9358+ } 9359+ return index; 9360+} 9361+ 9362+static inline int ElementOptMulIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 9363+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 9364+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9365+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9366+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_); 9367+ SIMD_ST_EPI32(out + index, vout); 9368+ } 9369+ return index; 9370+} 9371+ 9372+static inline int ElementOptMulReluIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 9373+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 9374+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9375+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9376+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f); 9377+ SIMD_ST_EPI32(out + index, vout); 9378+ } 9379+ return index; 9380+} 9381+ 9382+static inline int ElementOptMulReluIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 9383+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 9384+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9385+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9386+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f); 9387+ SIMD_ST_EPI32(out + index, vout); 9388+ } 9389+ return index; 9390+} 9391+ 9392+static inline int ElementOptMulRelu6IntNum0NEON(int index, const int 
*in0, const int *in1, int *out, int size) { 9393+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 9394+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9395+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9396+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f); 9397+ SIMD_ST_EPI32(out + index, vout); 9398+ } 9399+ return index; 9400+} 9401+ 9402+static inline int ElementOptMulRelu6IntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 9403+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 9404+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9405+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9406+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f); 9407+ SIMD_ST_EPI32(out + index, vout); 9408+ } 9409+ return index; 9410+} 9411+ 9412+#undef MS_SIMD_INSTRUCTION 9413+#undef BLOCK_NUM 9414+ 9415+#undef MS_SIMD_NEON 9416+#ifdef __cplusplus 9417+} 9418+#endif 9419+#endif 9420diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h 9421new file mode 100644 9422index 00000000..ea6acf62 9423--- /dev/null 9424+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h 9425@@ -0,0 +1,83 @@ 9426+/** 9427+ * Copyright 2022 Huawei Technologies Co., Ltd 9428+ * 9429+ * Licensed under the Apache License, Version 2.0 (the "License"); 9430+ * you may not use this file except in compliance with the License. 
9431+ * You may obtain a copy of the License at 9432+ * 9433+ * http://www.apache.org/licenses/LICENSE-2.0 9434+ * 9435+ * Unless required by applicable law or agreed to in writing, software 9436+ * distributed under the License is distributed on an "AS IS" BASIS, 9437+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9438+ * See the License for the specific language governing permissions and 9439+ * limitations under the License. 9440+ */ 9441+#ifndef MINDSPORE_NNACL_FP32_POOLING_NEON_H_ 9442+#define MINDSPORE_NNACL_FP32_POOLING_NEON_H_ 9443+ 9444+#include "nnacl/intrinsics/ms_simd_instructions.h" 9445+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9446+ 9447+#ifdef __cplusplus 9448+extern "C" { 9449+#endif 9450+ 9451+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9452+#define BLOCK_NUM 4 9453+#define MS_SIMD_NEON 9454+ 9455+static inline int AvgPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel, 9456+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 9457+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 9458+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 9459+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 9460+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 9461+ const float *src_c_ptr = src_plane_ptr + ci; 9462+ float *dst_c_ptr = dst_plane_ptr + ci; 9463+ SIMD_F32 tmp_avg = SIMD_SET0_F32; 9464+ int real_count = 0; 9465+ for (int h = real_win_h_start; h < real_win_h_end; h++) { 9466+ for (int w = real_win_w_start; w < real_win_w_end; w++) { 9467+ const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; 9468+ tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); 9469+ ++real_count; 9470+ } 9471+ } 9472+ tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); 9473+ tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); 9474+ tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); 9475+ 
SIMD_ST_F32(dst_c_ptr, tmp_avg); 9476+ } 9477+ return ci; 9478+} 9479+ 9480+static inline int MaxPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel, 9481+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 9482+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 9483+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 9484+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 9485+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 9486+ const float *src_c_ptr = src_plane_ptr + ci; 9487+ float *dst_c_ptr = dst_plane_ptr + ci; 9488+ SIMD_F32 tmp_max = min_val; 9489+ for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { 9490+ for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { 9491+ const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; 9492+ tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); 9493+ } 9494+ } 9495+ tmp_max = SIMD_MIN_F32(tmp_max, max_val); 9496+ SIMD_ST_F32(dst_c_ptr, tmp_max); 9497+ } 9498+ return ci; 9499+} 9500+ 9501+#undef MS_SIMD_INSTRUCTION 9502+#undef BLOCK_NUM 9503+ 9504+#undef MS_SIMD_NEON 9505+#ifdef __cplusplus 9506+} 9507+#endif 9508+#endif 9509diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h 9510new file mode 100644 9511index 00000000..fd8699c7 9512--- /dev/null 9513+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h 9514@@ -0,0 +1,100 @@ 9515+/** 9516+ * Copyright 2022 Huawei Technologies Co., Ltd 9517+ * 9518+ * Licensed under the Apache License, Version 2.0 (the "License"); 9519+ * you may not use this file except in compliance with the License. 
9520+ * You may obtain a copy of the License at 9521+ * 9522+ * http://www.apache.org/licenses/LICENSE-2.0 9523+ * 9524+ * Unless required by applicable law or agreed to in writing, software 9525+ * distributed under the License is distributed on an "AS IS" BASIS, 9526+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9527+ * See the License for the specific language governing permissions and 9528+ * limitations under the License. 9529+ */ 9530+#ifndef MINDSPORE_NNACL_FP32_POWER_NEON_H_ 9531+#define MINDSPORE_NNACL_FP32_POWER_NEON_H_ 9532+ 9533+#include "nnacl/intrinsics/ms_simd_instructions.h" 9534+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9535+ 9536+#ifdef __cplusplus 9537+extern "C" { 9538+#endif 9539+ 9540+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9541+#define BLOCK_NUM 4 9542+#define MS_SIMD_NEON 9543+ 9544+static inline int PowerBroadCastIntExponentNEON(int index, const float *input, int exponent, float *output, int len, 9545+ float scale, float shift) { 9546+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 9547+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 9548+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9549+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 9550+ SIMD_F32 result = SIMD_MOV_F32(1.0f); 9551+ int exp = abs(exponent); 9552+ while (exp) { 9553+ if (exp % 2) { 9554+ result = SIMD_MUL_F32(result, tmp); 9555+ } 9556+ tmp = SIMD_MUL_SQUARE_F32(tmp); 9557+ exp = exp / 2; 9558+ } 9559+ SIMD_ST_F32(output + index, exponent >= 0 ? 
result : SIMD_DIV_F32(SIMD_MOV_F32(1), result)); 9560+ } 9561+ return index; 9562+} 9563+ 9564+static inline int PowerBroadCastFloatExponentNEON(int index, const float *input, float exponent, float *output, int len, 9565+ float scale, float shift) { 9566+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 9567+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 9568+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9569+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 9570+ SIMD_F32 result; 9571+ for (int i = 0; i < BLOCK_NUM; ++i) { 9572+ SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent); 9573+ } 9574+ SIMD_ST_F32(output + index, result); 9575+ } 9576+ return index; 9577+} 9578+ 9579+static inline int PowerSingleExponentNEON(int index, const float *input, const float *exponent, float *output, int len, 9580+ float scale, float shift) { 9581+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 9582+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 9583+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9584+ SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 9585+ for (int j = 0; j < BLOCK_NUM; ++j) { 9586+ float cur_exponent = exponent[index + j]; 9587+ float cur_val = SIMD_F32_GETI(tmp_vec, j); 9588+ if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) { 9589+ int exp = abs((int)(cur_exponent)); 9590+ float result = 1; 9591+ while (exp) { 9592+ if (exp % 2) { 9593+ result *= cur_val; 9594+ } 9595+ cur_val *= cur_val; 9596+ exp = exp / 2; 9597+ } 9598+ output[index + j] = *exponent >= 0 ? 
result : 1 / result; 9599+ } else { 9600+ output[index + j] = powf(cur_val, cur_exponent); 9601+ } 9602+ } 9603+ } 9604+ return index; 9605+} 9606+ 9607+#undef MS_SIMD_INSTRUCTION 9608+#undef BLOCK_NUM 9609+ 9610+#undef MS_SIMD_NEON 9611+#ifdef __cplusplus 9612+} 9613+#endif 9614+#endif 9615diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h 9616new file mode 100644 9617index 00000000..7f9153f8 9618--- /dev/null 9619+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h 9620@@ -0,0 +1,180 @@ 9621+/** 9622+ * Copyright 2022 Huawei Technologies Co., Ltd 9623+ * 9624+ * Licensed under the Apache License, Version 2.0 (the "License"); 9625+ * you may not use this file except in compliance with the License. 9626+ * You may obtain a copy of the License at 9627+ * 9628+ * http://www.apache.org/licenses/LICENSE-2.0 9629+ * 9630+ * Unless required by applicable law or agreed to in writing, software 9631+ * distributed under the License is distributed on an "AS IS" BASIS, 9632+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9633+ * See the License for the specific language governing permissions and 9634+ * limitations under the License. 
9635+ */ 9636+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_ 9637+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_ 9638+ 9639+#include "nnacl/intrinsics/ms_simd_instructions.h" 9640+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9641+ 9642+#ifdef __cplusplus 9643+extern "C" { 9644+#endif 9645+ 9646+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9647+#define BLOCK_NUM 4 9648+#define MS_SIMD_NEON 9649+ 9650+static inline int64_t ReduceSumNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9651+ int axis_size) { 9652+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9653+ const float *inner_src = outer_src + index; 9654+ SIMD_F32 tmp = SIMD_MOV_F32(0); 9655+ for (int i = 0; i < axis_size; i++) { 9656+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 9657+ } 9658+ SIMD_ST_F32(outer_dst + index, tmp); 9659+ } 9660+ return index; 9661+} 9662+ 9663+static inline int64_t ReduceMeanNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9664+ int axis_size) { 9665+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9666+ const float *inner_src = outer_src + index; 9667+ SIMD_F32 tmp = SIMD_MOV_F32(0); 9668+ for (int i = 0; i < axis_size; i++) { 9669+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 9670+ } 9671+ SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size)); 9672+ } 9673+ return index; 9674+} 9675+ 9676+static inline int64_t ReduceMinNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9677+ int axis_size) { 9678+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9679+ const float *inner_src = outer_src + index; 9680+ SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX); 9681+ for (int i = 0; i < axis_size; i++) { 9682+ tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 9683+ } 9684+ 
SIMD_ST_F32(outer_dst + index, tmp); 9685+ } 9686+ return index; 9687+} 9688+ 9689+static inline int64_t ReduceMaxNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9690+ int axis_size) { 9691+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9692+ const float *inner_src = outer_src + index; 9693+ SIMD_F32 tmp = SIMD_MOV_F32(FLT_MIN); /* NOTE(review): FLT_MIN is the smallest POSITIVE normal float, not the most negative value; an all-negative reduction wrongly yields FLT_MIN. -FLT_MAX looks intended — confirm against the SIMD header generator before changing. */ 9694+ for (int i = 0; i < axis_size; i++) { 9695+ tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 9696+ } 9697+ SIMD_ST_F32(outer_dst + index, tmp); 9698+ } 9699+ return index; 9700+} 9701+ 9702+static inline int64_t ReduceProdNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9703+ int axis_size) { 9704+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9705+ const float *inner_src = outer_src + index; 9706+ SIMD_F32 tmp = SIMD_MOV_F32(1.0f); 9707+ for (int i = 0; i < axis_size; i++) { 9708+ tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 9709+ } 9710+ SIMD_ST_F32(outer_dst + index, tmp); 9711+ } 9712+ return index; 9713+} 9714+ 9715+static inline int64_t ReduceSumSquareNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9716+ int axis_size) { 9717+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9718+ const float *inner_src = outer_src + index; 9719+ SIMD_F32 tmp = SIMD_MOV_F32(0); 9720+ for (int i = 0; i < axis_size; i++) { 9721+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 9722+ } 9723+ SIMD_ST_F32(outer_dst + index, tmp); 9724+ } 9725+ return index; 9726+} 9727+ 9728+static inline int64_t ReduceL2NormNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 9729+ int axis_size) { 9730+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9731+ const float
*inner_src = outer_src + index; 9732+ SIMD_F32 tmp = SIMD_MOV_F32(0); 9733+ for (int i = 0; i < axis_size; i++) { 9734+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 9735+ } 9736+ SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); 9737+ } 9738+ return index; 9739+} 9740+ 9741+static inline int64_t IntReduceSumNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 9742+ int axis_size) { 9743+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9744+ const int *inner_src = outer_src + index; 9745+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 9746+ for (int i = 0; i < axis_size; i++) { 9747+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 9748+ } 9749+ SIMD_ST_EPI32(outer_dst + index, tmp); 9750+ } 9751+ return index; 9752+} 9753+ 9754+static inline int64_t IntReduceMeanNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 9755+ int axis_size) { 9756+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9757+ const int *inner_src = outer_src + index; 9758+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 9759+ for (int i = 0; i < axis_size; i++) { 9760+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 9761+ } 9762+ SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); 9763+ } 9764+ return index; 9765+} 9766+ 9767+static inline int64_t IntReduceMinNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 9768+ int axis_size) { 9769+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9770+ const int *inner_src = outer_src + index; 9771+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); 9772+ for (int i = 0; i < axis_size; i++) { 9773+ tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 9774+ } 9775+ SIMD_ST_EPI32(outer_dst + index, tmp); 9776+ } 9777+ return index; 9778+} 9779+ 
9780+static inline int64_t IntReduceMaxNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 9781+ int axis_size) { 9782+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9783+ const int *inner_src = outer_src + index; 9784+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); 9785+ for (int i = 0; i < axis_size; i++) { 9786+ tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 9787+ } 9788+ SIMD_ST_EPI32(outer_dst + index, tmp); 9789+ } 9790+ return index; 9791+} 9792+ 9793+#undef MS_SIMD_INSTRUCTION 9794+#undef BLOCK_NUM 9795+ 9796+#undef MS_SIMD_NEON 9797+#ifdef __cplusplus 9798+} 9799+#endif 9800+#endif 9801diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h 9802new file mode 100644 9803index 00000000..f116d92f 9804--- /dev/null 9805+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h 9806@@ -0,0 +1,86 @@ 9807+/** 9808+ * Copyright 2022 Huawei Technologies Co., Ltd 9809+ * 9810+ * Licensed under the Apache License, Version 2.0 (the "License"); 9811+ * you may not use this file except in compliance with the License. 9812+ * You may obtain a copy of the License at 9813+ * 9814+ * http://www.apache.org/licenses/LICENSE-2.0 9815+ * 9816+ * Unless required by applicable law or agreed to in writing, software 9817+ * distributed under the License is distributed on an "AS IS" BASIS, 9818+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9819+ * See the License for the specific language governing permissions and 9820+ * limitations under the License. 
9821+ */ 9822+ 9823+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_ 9824+#define MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_ 9825+ 9826+#include "nnacl/intrinsics/ms_simd_instructions.h" 9827+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9828+ 9829+#ifdef __cplusplus 9830+extern "C" { 9831+#endif 9832+ 9833+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9834+#define BLOCK_NUM 4 9835+#define MS_SIMD_NEON 9836+ 9837+static inline int64_t SoftmaxNormGetMaxNEON(int64_t index, const float *src, int cur_batch_offset, 9838+ float *max, int channel) { 9839+ if (channel >= BLOCK_NUM * BLOCK_NUM) { 9840+ SIMD_F32 max_val = SIMD_MOV_F32(*max); 9841+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9842+ max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); 9843+ } 9844+ *max = SIMD_GET_MAX_F32(max_val); 9845+ } 9846+ return index; 9847+} 9848+ 9849+static inline int64_t SoftmaxNormCalcNormNEON(int64_t index, const float *src, float *dst, 9850+ int cur_batch_offset, float max, int channel) { 9851+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9852+ SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); 9853+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 9854+ } 9855+ return index; 9856+} 9857+ 9858+static inline int64_t SoftmaxLastAxisGetExpSumNEON(int64_t index, const float *src, float *dst, 9859+ int cur_batch_offset, float max, float *exp_sum, int channel) { 9860+#ifndef _WIN32 9861+ SIMD_F32 sum_val = SIMD_SET0_F32; 9862+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9863+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 9864+ SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); 9865+ SIMD_F32 exp_out = SIMD_EXP_F32(output); 9866+ sum_val = SIMD_ADD_F32(sum_val, exp_out); 9867+ SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); 
9868+ } 9869+ *exp_sum += SIMD_GET_SUM_F32(sum_val); 9870+#endif 9871+ return index; 9872+} 9873+ 9874+static inline int64_t SoftmaxLastAxisGetResultNEON(int64_t index, const float *src, float *dst, 9875+ int cur_batch_offset, float exp_sum, int channel) { 9876+ SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); 9877+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9878+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 9879+ SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); 9880+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 9881+ } 9882+ return index; 9883+} 9884+ 9885+#undef MS_SIMD_INSTRUCTION 9886+#undef BLOCK_NUM 9887+ 9888+#undef MS_SIMD_NEON 9889+#ifdef __cplusplus 9890+}; 9891+#endif 9892+#endif 9893diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h 9894new file mode 100644 9895index 00000000..d2731101 9896--- /dev/null 9897+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h 9898@@ -0,0 +1,166 @@ 9899+/** 9900+ * Copyright 2022 Huawei Technologies Co., Ltd 9901+ * 9902+ * Licensed under the Apache License, Version 2.0 (the "License"); 9903+ * you may not use this file except in compliance with the License. 9904+ * You may obtain a copy of the License at 9905+ * 9906+ * http://www.apache.org/licenses/LICENSE-2.0 9907+ * 9908+ * Unless required by applicable law or agreed to in writing, software 9909+ * distributed under the License is distributed on an "AS IS" BASIS, 9910+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 9911+ * See the License for the specific language governing permissions and 9912+ * limitations under the License. 
9913+ */ 9914+ 9915+#ifndef MINDSPORE_NNACL_FP32_SUB_NEON_H_ 9916+#define MINDSPORE_NNACL_FP32_SUB_NEON_H_ 9917+ 9918+#include "nnacl/intrinsics/ms_simd_instructions.h" 9919+#include "nnacl/intrinsics/ms_simd_neon_instructions.h" 9920+ 9921+#ifdef __cplusplus 9922+extern "C" { 9923+#endif 9924+ 9925+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION 9926+#define BLOCK_NUM 4 9927+#define MS_SIMD_NEON 9928+ 9929+static inline int ElementOptSubNum0NEON(int index, const float *in0, const float *in1, float *out, 9930+ int size) { 9931+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 9932+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9933+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9934+ SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); 9935+ SIMD_ST_F32(out + index, vout); 9936+ } 9937+ return index; 9938+} 9939+ 9940+static inline int ElementOptSubNum1NEON(int index, const float *in0, const float *in1, float *out, 9941+ int size) { 9942+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 9943+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9944+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9945+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); 9946+ SIMD_ST_F32(out + index, vout); 9947+ } 9948+ return index; 9949+} 9950+ 9951+static inline int ElementOptSubIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { 9952+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 9953+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9954+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 9955+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); 9956+ SIMD_ST_EPI32(out + index, vout); 9957+ } 9958+ return index; 9959+} 9960+ 9961+static inline int ElementOptSubIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { 9962+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 9963+ for (int block_max_size = size - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { 9964+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 9965+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); 9966+ SIMD_ST_EPI32(out + index, vout); 9967+ } 9968+ return index; 9969+} 9970+ 9971+static inline int ElementOptSubReluNum0NEON(int index, const float *in0, const float *in1, float *out, 9972+ int size) { 9973+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 9974+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9975+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9976+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); 9977+ SIMD_ST_F32(out + index, vout); 9978+ } 9979+ return index; 9980+} 9981+ 9982+static inline int ElementOptSubReluNum1NEON(int index, const float *in0, const float *in1, float *out, 9983+ int size) { 9984+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 9985+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9986+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 9987+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); 9988+ SIMD_ST_F32(out + index, vout); 9989+ } 9990+ return index; 9991+} 9992+ 9993+static inline int ElementOptSubRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, 9994+ int size) { 9995+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 9996+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 9997+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 9998+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); 9999+ SIMD_ST_F32(out + index, vout); 10000+ } 10001+ return index; 10002+} 10003+ 10004+static inline int ElementOptSubRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, 10005+ int size) { 10006+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 10007+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10008+ SIMD_F32 vin0 = 
SIMD_LD_F32(in0 + index); 10009+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); 10010+ SIMD_ST_F32(out + index, vout); 10011+ } 10012+ return index; 10013+} 10014+ 10015+static inline int ElementSubNEON(int index, const float *in0, const float *in1, float *out, int size) { 10016+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10017+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 10018+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10019+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); 10020+ SIMD_ST_F32(out + index, vout); 10021+ } 10022+ return index; 10023+} 10024+ 10025+static inline int ElementSubIntNEON(int index, const int *in0, const int *in1, int *out, int size) { 10026+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10027+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 10028+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 10029+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); 10030+ SIMD_ST_EPI32(out + index, vout); 10031+ } 10032+ return index; 10033+} 10034+ 10035+static inline int ElementSubReluNEON(int index, const float *in0, const float *in1, float *out, 10036+ int size) { 10037+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10038+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 10039+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10040+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); 10041+ SIMD_ST_F32(out + index, vout); 10042+ } 10043+ return index; 10044+} 10045+ 10046+static inline int ElementSubRelu6NEON(int index, const float *in0, const float *in1, float *out, 10047+ int size) { 10048+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10049+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 10050+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10051+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); 10052+ 
SIMD_ST_F32(out + index, vout); 10053+ } 10054+ return index; 10055+} 10056+ 10057+#undef MS_SIMD_INSTRUCTION 10058+#undef BLOCK_NUM 10059+ 10060+#undef MS_SIMD_NEON 10061+#ifdef __cplusplus 10062+}; 10063+#endif 10064+#endif 10065diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h 10066new file mode 100644 10067index 00000000..75bda800 10068--- /dev/null 10069+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h 10070@@ -0,0 +1,36 @@ 10071+/** 10072+ * Copyright 2022 Huawei Technologies Co., Ltd 10073+ * 10074+ * Licensed under the Apache License, Version 2.0 (the "License"); 10075+ * you may not use this file except in compliance with the License. 10076+ * You may obtain a copy of the License at 10077+ * 10078+ * http://www.apache.org/licenses/LICENSE-2.0 10079+ * 10080+ * Unless required by applicable law or agreed to in writing, software 10081+ * distributed under the License is distributed on an "AS IS" BASIS, 10082+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10083+ * See the License for the specific language governing permissions and 10084+ * limitations under the License. 
10085+ */ 10086+#ifndef MINDSPORE_NNACL_POOLING_FP32_SIMD_H_ 10087+#define MINDSPORE_NNACL_POOLING_FP32_SIMD_H_ 10088+ 10089+#include "nnacl/intrinsics/ms_simd_instructions.h" 10090+#ifdef ENABLE_AVX512 10091+#include "nnacl/avx512/pooling_fp32_avx512.h" 10092+#endif 10093+ 10094+#ifdef ENABLE_AVX 10095+#include "nnacl/avx/pooling_fp32_avx.h" 10096+#endif 10097+ 10098+#ifdef ENABLE_SSE 10099+#include "nnacl/sse/pooling_fp32_sse.h" 10100+#endif 10101+ 10102+#ifdef ENABLE_ARM 10103+#include "nnacl/neon/pooling_fp32_neon.h" 10104+#endif 10105+ 10106+#endif 10107diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h 10108new file mode 100644 10109index 00000000..15e9f009 10110--- /dev/null 10111+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h 10112@@ -0,0 +1,36 @@ 10113+/** 10114+ * Copyright 2022 Huawei Technologies Co., Ltd 10115+ * 10116+ * Licensed under the Apache License, Version 2.0 (the "License"); 10117+ * you may not use this file except in compliance with the License. 10118+ * You may obtain a copy of the License at 10119+ * 10120+ * http://www.apache.org/licenses/LICENSE-2.0 10121+ * 10122+ * Unless required by applicable law or agreed to in writing, software 10123+ * distributed under the License is distributed on an "AS IS" BASIS, 10124+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10125+ * See the License for the specific language governing permissions and 10126+ * limitations under the License. 
10127+ */ 10128+#ifndef MINDSPORE_NNACL_POWER_FP32_SIMD_H_ 10129+#define MINDSPORE_NNACL_POWER_FP32_SIMD_H_ 10130+ 10131+#include "nnacl/intrinsics/ms_simd_instructions.h" 10132+#ifdef ENABLE_AVX512 10133+#include "nnacl/avx512/power_fp32_avx512.h" 10134+#endif 10135+ 10136+#ifdef ENABLE_AVX 10137+#include "nnacl/avx/power_fp32_avx.h" 10138+#endif 10139+ 10140+#ifdef ENABLE_SSE 10141+#include "nnacl/sse/power_fp32_sse.h" 10142+#endif 10143+ 10144+#ifdef ENABLE_ARM 10145+#include "nnacl/neon/power_fp32_neon.h" 10146+#endif 10147+ 10148+#endif 10149diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h 10150new file mode 100644 10151index 00000000..60d0cd85 10152--- /dev/null 10153+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h 10154@@ -0,0 +1,36 @@ 10155+/** 10156+ * Copyright 2022 Huawei Technologies Co., Ltd 10157+ * 10158+ * Licensed under the Apache License, Version 2.0 (the "License"); 10159+ * you may not use this file except in compliance with the License. 10160+ * You may obtain a copy of the License at 10161+ * 10162+ * http://www.apache.org/licenses/LICENSE-2.0 10163+ * 10164+ * Unless required by applicable law or agreed to in writing, software 10165+ * distributed under the License is distributed on an "AS IS" BASIS, 10166+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10167+ * See the License for the specific language governing permissions and 10168+ * limitations under the License. 
10169+ */ 10170+#ifndef MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_ 10171+#define MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_ 10172+ 10173+#include "nnacl/intrinsics/ms_simd_instructions.h" 10174+#ifdef ENABLE_AVX512 10175+#include "nnacl/avx512/reduce_fp32_avx512.h" 10176+#endif 10177+ 10178+#ifdef ENABLE_AVX 10179+#include "nnacl/avx/reduce_fp32_avx.h" 10180+#endif 10181+ 10182+#ifdef ENABLE_SSE 10183+#include "nnacl/sse/reduce_fp32_sse.h" 10184+#endif 10185+ 10186+#ifdef ENABLE_ARM 10187+#include "nnacl/neon/reduce_fp32_neon.h" 10188+#endif 10189+ 10190+#endif 10191diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h 10192new file mode 100644 10193index 00000000..524668ab 10194--- /dev/null 10195+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h 10196@@ -0,0 +1,36 @@ 10197+/** 10198+ * Copyright 2022 Huawei Technologies Co., Ltd 10199+ * 10200+ * Licensed under the Apache License, Version 2.0 (the "License"); 10201+ * you may not use this file except in compliance with the License. 10202+ * You may obtain a copy of the License at 10203+ * 10204+ * http://www.apache.org/licenses/LICENSE-2.0 10205+ * 10206+ * Unless required by applicable law or agreed to in writing, software 10207+ * distributed under the License is distributed on an "AS IS" BASIS, 10208+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10209+ * See the License for the specific language governing permissions and 10210+ * limitations under the License. 
10211+ */ 10212+#ifndef MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_ 10213+#define MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_ 10214+ 10215+#include "nnacl/intrinsics/ms_simd_instructions.h" 10216+#ifdef ENABLE_AVX512 10217+#include "nnacl/avx512/softmax_fp32_avx512.h" 10218+#endif 10219+ 10220+#ifdef ENABLE_AVX 10221+#include "nnacl/avx/softmax_fp32_avx.h" 10222+#endif 10223+ 10224+#ifdef ENABLE_SSE 10225+#include "nnacl/sse/softmax_fp32_sse.h" 10226+#endif 10227+ 10228+#ifdef ENABLE_ARM 10229+#include "nnacl/neon/softmax_fp32_neon.h" 10230+#endif 10231+ 10232+#endif 10233diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h 10234new file mode 100644 10235index 00000000..192fc66d 10236--- /dev/null 10237+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h 10238@@ -0,0 +1,221 @@ 10239+/** 10240+ * Copyright 2022 Huawei Technologies Co., Ltd 10241+ * 10242+ * Licensed under the Apache License, Version 2.0 (the "License"); 10243+ * you may not use this file except in compliance with the License. 10244+ * You may obtain a copy of the License at 10245+ * 10246+ * http://www.apache.org/licenses/LICENSE-2.0 10247+ * 10248+ * Unless required by applicable law or agreed to in writing, software 10249+ * distributed under the License is distributed on an "AS IS" BASIS, 10250+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10251+ * See the License for the specific language governing permissions and 10252+ * limitations under the License. 
10253+ */ 10254+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_ 10255+#define MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_ 10256+ 10257+#include "nnacl/intrinsics/ms_simd_instructions.h" 10258+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 10259+ 10260+#ifdef __cplusplus 10261+extern "C" { 10262+#endif 10263+#pragma GCC push_options 10264+#pragma GCC target("sse4.1") 10265+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 10266+#define BLOCK_NUM 4 10267+#define MS_SIMD_SSE 10268+ 10269+static inline int Fp32ReluSSE(int index, const float *src, int length, float *dst) { 10270+ SIMD_F32 zero = SIMD_SET0_F32; 10271+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10272+ SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); 10273+ } 10274+ return index; 10275+} 10276+ 10277+static inline int Int32ReluSSE(int index, const int32_t *src, int length, int32_t *dst) { 10278+ SIMD_EPI32 zero = SIMD_MOV_EPI32(0); 10279+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10280+ SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); 10281+ } 10282+ return index; 10283+} 10284+ 10285+static inline int Fp32Relu6SSE(int index, const float *src, int length, float *dst) { 10286+ SIMD_F32 zero = SIMD_SET0_F32; 10287+ SIMD_F32 six = SIMD_MOV_F32(6.0f); 10288+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10289+ SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); 10290+ } 10291+ return index; 10292+} 10293+ 10294+static inline int LReluSSE(int index, const float *src, int length, float *dst, float alpha) { 10295+ SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); 10296+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10297+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 10298+ SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
10299+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask)); 10300+ } 10301+ return index; 10302+} 10303+ 10304+static inline int SigmoidSSE(int index, const float *src, int length, float *dst) { 10305+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10306+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); 10307+ SIMD_ST_F32(dst + index, 10308+ SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 10309+ } 10310+ return index; 10311+} 10312+ 10313+static inline int TanhSSE(int index, const float *src, int length, float *dst) { 10314+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10315+ SIMD_F32 input = SIMD_LD_F32(src + index); 10316+ SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); 10317+ } 10318+ return index; 10319+} 10320+ 10321+static inline int SwishSSE(int index, const float *src, int length, float *dst) { 10322+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10323+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 10324+ SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); 10325+ SIMD_ST_F32(dst + index, 10326+ SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); 10327+ } 10328+ return index; 10329+} 10330+ 10331+static inline int HSwishSSE(int index, const float *src, int length, float *dst) { 10332+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10333+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 10334+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 10335+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); 10336+ } 10337+ return index; 10338+} 10339+ 10340+static inline int HSigmoidSSE(int index, const float *src, int length, float *dst) { 10341+ for 
(int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10342+ SIMD_F32 src_value = SIMD_LD_F32(src + index); 10343+ SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); 10344+ SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); 10345+ } 10346+ return index; 10347+} 10348+ 10349+static inline int HardTanhNoLimitMinSSE(int index, const float *src, int length, float *dst, float min_val, 10350+ float max_val) { 10351+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10352+ SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); 10353+ } 10354+ return index; 10355+} 10356+ 10357+static inline int HardTanhNoLimitMaxSSE(int index, const float *src, int length, float *dst, float min_val, 10358+ float max_val) { 10359+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10360+ SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); 10361+ } 10362+ return index; 10363+} 10364+ 10365+static inline int HardTanhLimitMinMaxSSE(int index, const float *src, int length, float *dst, float min_val, 10366+ float max_val) { 10367+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10368+ SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); 10369+ } 10370+ return index; 10371+} 10372+ 10373+static inline int GeluApproximateSSE(int index, const float *src, int length, float *dst) { 10374+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10375+ SIMD_F32 in = SIMD_LD_F32(src + index); 10376+ SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); 10377+ SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); 10378+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); 10379+ } 10380+ return 
index; 10381+} 10382+ 10383+static inline int GeluSSE(int index, const float *src, int length, float *dst) { 10384+ SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); 10385+ SIMD_F32 para2 = SIMD_MOV_F32(1.0f); 10386+ SIMD_F32 para3 = SIMD_MOV_F32(0.5f); 10387+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10388+ SIMD_F32 in = SIMD_LD_F32(src + index); 10389+ SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); 10390+ SIMD_ST_F32(dst + index, res); 10391+ } 10392+ return index; 10393+} 10394+ 10395+static inline int EluSSE(int index, const float *src, int length, float *dst, float alpha) { 10396+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10397+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 10398+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); 10399+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 10400+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 10401+ } 10402+ return index; 10403+} 10404+ 10405+static inline int CeluSSE(int index, const float *src, int length, float *dst, float alpha) { 10406+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10407+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 10408+ SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); 10409+ SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); 10410+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); 10411+ } 10412+ return index; 10413+} 10414+ 10415+static inline int HShrinkSSE(int index, const float *src, int length, float *dst, float lambd) { 10416+ const float neg_lambd = -1 * lambd; 10417+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10418+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 
10419+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd)); 10420+ SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp); 10421+ SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); 10422+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask)); 10423+ } 10424+ return index; 10425+} 10426+ 10427+static inline int SoftShrinkSSE(int index, const float *src, int length, float *dst, float lambd) { 10428+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 10429+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 10430+ 10431+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10432+ SIMD_F32 src_t = SIMD_LD_F32(src + index); 10433+ /* v0 = (in > lamdb) & (in - lamdb) */ 10434+ SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v)); 10435+ /* v1 = (in < -lamdb) & (in + lamdb) */ 10436+ SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v)); 10437+ /* out = (v0 | v1) */ 10438+ SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1)); 10439+ } 10440+ return index; 10441+} 10442+ 10443+static inline int SoftsignFp32OptSSE(int index, const float *src, int length, float *dst) { 10444+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10445+ SIMD_F32 src_tmp = SIMD_LD_F32(src + index); 10446+ SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp)); 10447+ SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp)); 10448+ } 10449+ return index; 10450+} 10451+ 10452+#undef MS_SIMD_INSTRUCTION 10453+#undef BLOCK_NUM 10454+#pragma GCC pop_options 10455+#undef MS_SIMD_SSE 10456+#ifdef __cplusplus 10457+} 10458+#endif 10459+#endif 10460diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h 10461new file mode 100644 
10462index 00000000..85996f69 10463--- /dev/null 10464+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h 10465@@ -0,0 +1,57 @@ 10466+/** 10467+ * Copyright 2022 Huawei Technologies Co., Ltd 10468+ * 10469+ * Licensed under the Apache License, Version 2.0 (the "License"); 10470+ * you may not use this file except in compliance with the License. 10471+ * You may obtain a copy of the License at 10472+ * 10473+ * http://www.apache.org/licenses/LICENSE-2.0 10474+ * 10475+ * Unless required by applicable law or agreed to in writing, software 10476+ * distributed under the License is distributed on an "AS IS" BASIS, 10477+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10478+ * See the License for the specific language governing permissions and 10479+ * limitations under the License. 10480+ */ 10481+ 10482+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_ 10483+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_ 10484+ 10485+#include "nnacl/intrinsics/ms_simd_instructions.h" 10486+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 10487+ 10488+#ifdef __cplusplus 10489+extern "C" { 10490+#endif 10491+#pragma GCC push_options 10492+#pragma GCC target("sse4.1") 10493+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 10494+#define BLOCK_NUM 4 10495+#define MS_SIMD_SSE 10496+ 10497+static inline int ShrinkGradSSE(int index, const float *src0, const float *src1, 10498+ int length, float *dst, float lambd) { 10499+ SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); 10500+ SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); 10501+ 10502+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10503+ SIMD_F32 src0_t = SIMD_LD_F32(src0 + index); 10504+ SIMD_F32 src1_t = SIMD_LD_F32(src1 + index); 10505+ 10506+ SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v); 10507+ SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t); 10508+ SIMD_MASK mask = SIMD_AND_MASK(mask0, 
mask1); 10509+ 10510+ SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask)); 10511+ } 10512+ return index; 10513+} 10514+ 10515+#undef MS_SIMD_INSTRUCTION 10516+#undef BLOCK_NUM 10517+#pragma GCC pop_options 10518+#undef MS_SIMD_SSE 10519+#ifdef __cplusplus 10520+} 10521+#endif 10522+#endif 10523diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h 10524new file mode 100644 10525index 00000000..1f5291a4 10526--- /dev/null 10527+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h 10528@@ -0,0 +1,210 @@ 10529+/** 10530+ * Copyright 2022 Huawei Technologies Co., Ltd 10531+ * 10532+ * Licensed under the Apache License, Version 2.0 (the "License"); 10533+ * you may not use this file except in compliance with the License. 10534+ * You may obtain a copy of the License at 10535+ * 10536+ * http://www.apache.org/licenses/LICENSE-2.0 10537+ * 10538+ * Unless required by applicable law or agreed to in writing, software 10539+ * distributed under the License is distributed on an "AS IS" BASIS, 10540+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10541+ * See the License for the specific language governing permissions and 10542+ * limitations under the License. 
10543+ */ 10544+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_ 10545+#define MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_ 10546+ 10547+#include "nnacl/intrinsics/ms_simd_instructions.h" 10548+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 10549+ 10550+#ifdef __cplusplus 10551+extern "C" { 10552+#endif 10553+#pragma GCC push_options 10554+#pragma GCC target("sse4.1") 10555+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 10556+#define BLOCK_NUM 4 10557+#define MS_SIMD_SSE 10558+#ifdef MS_SIMD_AVX512 10559+ static inline size_t AdamWeightDecayFp32SSE(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 10560+ const float *gradient, size_t end) { 10561+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 10562+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 10563+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 10564+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 10565+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 10566+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 10567+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 10568+ 10569+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10570+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 10571+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 10572+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 10573+ SIMD_F32 g_r = SIMD_LD_F32(gradient + index); 10574+ 10575+ m_r = SIMD_MUL_F32(m_r, beta1_r); 10576+ v_r = SIMD_MUL_F32(v_r, beta2_r); 10577+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 10578+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 10579+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 10580+ avx_r0 = SIMD_SQRT_F32(v_r); 10581+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 10582+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 10583+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 10584+ SIMD_ST_F32(m + index, m_r); 10585+ SIMD_ST_F32(v + index, v_r); 10586+ SIMD_ST_F32(var + index, var_r); 10587+ } 10588+ 10589+ return index; 10590+} 
10591+ 10592+static inline size_t FusedCastAdamFp32Fp16SSE(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 10593+ float global_norm_reciprocal, size_t end) { 10594+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 10595+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 10596+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 10597+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 10598+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 10599+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 10600+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 10601+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 10602+ 10603+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10604+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 10605+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 10606+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 10607+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 10608+ 10609+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 10610+ m_r = SIMD_MUL_F32(m_r, beta1_r); 10611+ v_r = SIMD_MUL_F32(v_r, beta2_r); 10612+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 10613+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 10614+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 10615+ avx_r0 = SIMD_SQRT_F32(v_r); 10616+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 10617+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 10618+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 10619+ SIMD_ST_F32(var + index, var_r); 10620+ SIMD_ST_F32(m + index, m_r); 10621+ SIMD_ST_F32(v + index, v_r); 10622+ } 10623+ 10624+ return index; 10625+} 10626+ 10627+static inline size_t FusedCastAdamFp32Fp32SSE(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 10628+ float global_norm_reciprocal, size_t end) { 10629+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 10630+ 
SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 10631+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 10632+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 10633+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 10634+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 10635+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 10636+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 10637+ 10638+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10639+ SIMD_F32 var_r = SIMD_LD_F32(var + index); 10640+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 10641+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 10642+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 10643+ 10644+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 10645+ m_r = SIMD_MUL_F32(m_r, beta1_r); 10646+ v_r = SIMD_MUL_F32(v_r, beta2_r); 10647+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 10648+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 10649+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 10650+ avx_r0 = SIMD_SQRT_F32(v_r); 10651+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 10652+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 10653+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 10654+ SIMD_ST_F32(var + index, var_r); 10655+ SIMD_ST_F32(m + index, m_r); 10656+ SIMD_ST_F32(v + index, v_r); 10657+ } 10658+ 10659+ return index; 10660+} 10661+ 10662+static inline size_t FusedCastAdamFp16Fp16SSE(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 10663+ float global_norm_reciprocal, size_t end) { 10664+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 10665+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 10666+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 10667+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 10668+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 10669+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 10670+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 10671+ SIMD_F32 
global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 10672+ 10673+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10674+ SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); 10675+ SIMD_F32 m_r = SIMD_LD_F32(m + index); 10676+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 10677+ SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); 10678+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 10679+ m_r = SIMD_MUL_F32(m_r, beta1_r); 10680+ v_r = SIMD_MUL_F32(v_r, beta2_r); 10681+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 10682+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 10683+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 10684+ avx_r0 = SIMD_SQRT_F32(v_r); 10685+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 10686+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 10687+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 10688+ SIMD_ST_F32(m + index, m_r); 10689+ SIMD_ST_F32(v + index, v_r); 10690+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 10691+ } 10692+ 10693+ return index; 10694+} 10695+ 10696+static inline size_t FusedCastAdamFp16Fp32SSE(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, 10697+ float global_norm_reciprocal, size_t end) { 10698+ SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); 10699+ SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); 10700+ SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); 10701+ SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); 10702+ SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); 10703+ SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); 10704+ SIMD_F32 decay_r = SIMD_MOV_F32(decay); 10705+ SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); 10706+ 10707+ for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10708+ SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); 10709+ SIMD_F32
m_r = SIMD_LD_F32(m + index); 10710+ SIMD_F32 v_r = SIMD_LD_F32(v + index); 10711+ SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); 10712+ g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); 10713+ m_r = SIMD_MUL_F32(m_r, beta1_r); 10714+ v_r = SIMD_MUL_F32(v_r, beta2_r); 10715+ SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); 10716+ m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); 10717+ v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); 10718+ avx_r0 = SIMD_SQRT_F32(v_r); 10719+ avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); 10720+ avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); 10721+ var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); 10722+ SIMD_ST_F32(m + index, m_r); 10723+ SIMD_ST_F32(v + index, v_r); 10724+ SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); 10725+ } 10726+ 10727+ return index; 10728+} 10729+#endif 10730+ 10731+#undef MS_SIMD_INSTRUCTION 10732+#undef BLOCK_NUM 10733+#pragma GCC pop_options 10734+#undef MS_SIMD_SSE 10735+#ifdef __cplusplus 10736+} 10737+#endif 10738+#endif 10739diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h 10740new file mode 100644 10741index 00000000..eb705534 10742--- /dev/null 10743+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h 10744@@ -0,0 +1,124 @@ 10745+/** 10746+ * Copyright 2022 Huawei Technologies Co., Ltd 10747+ * 10748+ * Licensed under the Apache License, Version 2.0 (the "License"); 10749+ * you may not use this file except in compliance with the License. 10750+ * You may obtain a copy of the License at 10751+ * 10752+ * http://www.apache.org/licenses/LICENSE-2.0 10753+ * 10754+ * Unless required by applicable law or agreed to in writing, software 10755+ * distributed under the License is distributed on an "AS IS" BASIS, 10756+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10757+ * See the License for the specific language governing permissions and 10758+ * limitations under the License. 10759+ */ 10760+ 10761+#ifndef MINDSPORE_NNACL_FP32_ADD_SSE_H_ 10762+#define MINDSPORE_NNACL_FP32_ADD_SSE_H_ 10763+ 10764+#include "nnacl/intrinsics/ms_simd_instructions.h" 10765+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 10766+ 10767+#ifdef __cplusplus 10768+extern "C" { 10769+#endif 10770+#pragma GCC push_options 10771+#pragma GCC target("sse4.1") 10772+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 10773+#define BLOCK_NUM 4 10774+#define MS_SIMD_SSE 10775+ 10776+static inline int ElementOptAddSSE(int index, const float *in0, const float *in1, float *out, int size) { 10777+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 10778+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10779+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10780+ SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); 10781+ SIMD_ST_F32(out + index, vout); 10782+ } 10783+ return index; 10784+} 10785+ 10786+static inline int ElementOptAddIntSSE(int index, const int *in0, const int *in1, int *out, 10787+ int size) { 10788+ SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); 10789+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10790+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 10791+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); 10792+ SIMD_ST_EPI32(out + index, vout); 10793+ } 10794+ return index; 10795+} 10796+ 10797+static inline int ElementOptAddReluSSE(int index, const float *in0, const float *in1, float *out, 10798+ int size) { 10799+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 10800+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10801+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10802+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); 10803+ SIMD_ST_F32(out + index, vout); 10804+ } 10805+ return index; 10806+} 10807+ 10808+static inline int 
ElementOptAddRelu6SSE(int index, const float *in0, const float *in1, float *out, 10809+ int size) { 10810+ SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); 10811+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10812+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10813+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); 10814+ SIMD_ST_F32(out + index, vout); 10815+ } 10816+ return index; 10817+} 10818+ 10819+static inline int ElementAddSSE(int index, const float *in0, const float *in1, float *out, int size) { 10820+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10821+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 10822+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10823+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 10824+ SIMD_ST_F32(out + index, vout); 10825+ } 10826+ return index; 10827+} 10828+ 10829+static inline int ElementAddReluSSE(int index, const float *in0, const float *in1, float *out, 10830+ int size) { 10831+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10832+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 10833+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10834+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); 10835+ SIMD_ST_F32(out + index, vout); 10836+ } 10837+ return index; 10838+} 10839+ 10840+static inline int ElementAddRelu6SSE(int index, const float *in0, const float *in1, float *out, 10841+ int size) { 10842+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10843+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 10844+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 10845+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); 10846+ SIMD_ST_F32(out + index, vout); 10847+ } 10848+ return index; 10849+} 10850+ 10851+static inline int ElementAddIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 10852+ 
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10853+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 10854+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 10855+ SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1); 10856+ SIMD_ST_EPI32(out + index, vout); 10857+ } 10858+ return index; 10859+} 10860+ 10861+#undef MS_SIMD_INSTRUCTION 10862+#undef BLOCK_NUM 10863+#pragma GCC pop_options 10864+#undef MS_SIMD_SSE 10865+#ifdef __cplusplus 10866+} 10867+#endif 10868+#endif 10869diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h 10870new file mode 100644 10871index 00000000..173890b4 10872--- /dev/null 10873+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h 10874@@ -0,0 +1,254 @@ 10875+/** 10876+ * Copyright 2022 Huawei Technologies Co., Ltd 10877+ * 10878+ * Licensed under the Apache License, Version 2.0 (the "License"); 10879+ * you may not use this file except in compliance with the License. 10880+ * You may obtain a copy of the License at 10881+ * 10882+ * http://www.apache.org/licenses/LICENSE-2.0 10883+ * 10884+ * Unless required by applicable law or agreed to in writing, software 10885+ * distributed under the License is distributed on an "AS IS" BASIS, 10886+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10887+ * See the License for the specific language governing permissions and 10888+ * limitations under the License. 
10889+ */ 10890+ 10891+#ifndef MINDSPORE_NNACL_ARITHMETIC_SSE_H_ 10892+#define MINDSPORE_NNACL_ARITHMETIC_SSE_H_ 10893+ 10894+#include "nnacl/intrinsics/ms_simd_instructions.h" 10895+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 10896+ 10897+#ifdef __cplusplus 10898+extern "C" { 10899+#endif 10900+#pragma GCC push_options 10901+#pragma GCC target("sse4.1") 10902+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 10903+#define BLOCK_NUM 4 10904+#define MS_SIMD_SSE 10905+ 10906+#ifndef MS_SIMD_NEON 10907+static inline int ElementFloorModSSE(int index, const float *in0, const float *in1, float *out, int size) { 10908+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10909+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 10910+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 10911+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 10912+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 10913+ SIMD_ST_F32(out + index, out_tmp); 10914+ } 10915+ return index; 10916+} 10917+ 10918+static inline int ElementOptFloorModNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { 10919+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 10920+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10921+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 10922+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 10923+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 10924+ SIMD_ST_F32(out + index, out_tmp); 10925+ } 10926+ return index; 10927+} 10928+ 10929+static inline int ElementOptFloorModNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { 10930+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 10931+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10932+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 10933+ SIMD_F32 floor_tmp = 
SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 10934+ SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); 10935+ SIMD_ST_F32(out + index, out_tmp); 10936+ } 10937+ return index; 10938+} 10939+ 10940+static inline int ElementFloorDivSSE(int index, const float *in0, const float *in1, float *out, int size) { 10941+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10942+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 10943+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 10944+ SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 10945+ SIMD_ST_F32(out + index, floor_tmp); 10946+ } 10947+ return index; 10948+} 10949+ 10950+static inline int ElementOptFloorDivNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { 10951+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 10952+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10953+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 10954+ SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 10955+ SIMD_ST_F32(out + index, out_tmp); 10956+ } 10957+ return index; 10958+} 10959+ 10960+static inline int ElementOptFloorDivNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { 10961+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 10962+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10963+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 10964+ SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); 10965+ SIMD_ST_F32(out + index, out_tmp); 10966+ } 10967+ return index; 10968+} 10969+#endif 10970+ 10971+static inline int ElementFloorDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 10972+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10973+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 10974+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + 
index); 10975+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 10976+ SIMD_ST_EPI32(out + index, out_tmp); 10977+ } 10978+ return index; 10979+} 10980+ 10981+static inline int ElementOptFloorDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 10982+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 10983+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10984+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 10985+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 10986+ SIMD_ST_EPI32(out + index, out_tmp); 10987+ } 10988+ return index; 10989+} 10990+ 10991+static inline int ElementOptFloorDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 10992+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 10993+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 10994+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 10995+ SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); 10996+ SIMD_ST_EPI32(out + index, out_tmp); 10997+ } 10998+ return index; 10999+} 11000+ 11001+static inline int ElementMaximumSSE(int index, const float *in0, const float *in1, float *out, int size) { 11002+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11003+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 11004+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 11005+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 11006+ SIMD_ST_F32(out + index, out_tmp); 11007+ } 11008+ return index; 11009+} 11010+ 11011+static inline int ElementOptMaximumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { 11012+ SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); 11013+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11014+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 11015+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 11016+ SIMD_ST_F32(out + index, out_tmp); 
11017+ } 11018+ return index; 11019+} 11020+ 11021+static inline int ElementOptMaximumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { 11022+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 11023+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11024+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 11025+ SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); 11026+ SIMD_ST_F32(out + index, out_tmp); 11027+ } 11028+ return index; 11029+} 11030+ 11031+static inline int ElementMaximumIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 11032+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11033+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 11034+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 11035+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 11036+ SIMD_ST_EPI32(out + index, out_tmp); 11037+ } 11038+ return index; 11039+} 11040+ 11041+static inline int ElementOptMaximumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 11042+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 11043+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11044+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 11045+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 11046+ SIMD_ST_EPI32(out + index, out_tmp); 11047+ } 11048+ return index; 11049+} 11050+ 11051+static inline int ElementOptMaximumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 11052+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 11053+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11054+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 11055+ SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); 11056+ SIMD_ST_EPI32(out + index, out_tmp); 11057+ } 11058+ return index; 11059+} 11060+ 11061+static inline int ElementMinimumIntSSE(int index, 
const int *in0, const int *in1, int *out, int size) { 11062+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11063+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 11064+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 11065+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 11066+ SIMD_ST_EPI32(out + index, out_tmp); 11067+ } 11068+ return index; 11069+} 11070+ 11071+static inline int ElementOptMinimumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 11072+ SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); 11073+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11074+ SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); 11075+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 11076+ SIMD_ST_EPI32(out + index, out_tmp); 11077+ } 11078+ return index; 11079+} 11080+ 11081+static inline int ElementOptMinimumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 11082+ SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); 11083+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11084+ SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); 11085+ SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); 11086+ SIMD_ST_EPI32(out + index, out_tmp); 11087+ } 11088+ return index; 11089+} 11090+ 11091+static inline int ElementMinimumSSE(int index, const float *in0, const float *in1, float *out, int size) { 11092+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11093+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 11094+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 11095+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 11096+ SIMD_ST_F32(out + index, out_tmp); 11097+ } 11098+ return index; 11099+} 11100+ 11101+static inline int ElementOptMinimumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { 11102+ SIMD_F32 in0_tmp = 
SIMD_MOV_F32(in0[0]); 11103+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11104+ SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); 11105+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 11106+ SIMD_ST_F32(out + index, out_tmp); 11107+ } 11108+ return index; 11109+} 11110+ 11111+static inline int ElementOptMinimumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { 11112+ SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); 11113+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11114+ SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 11115+ SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); 11116+ SIMD_ST_F32(out + index, out_tmp); 11117+ } 11118+ return index; 11119+} 11120+ 11121+#undef MS_SIMD_INSTRUCTION 11122+#undef BLOCK_NUM 11123+#pragma GCC pop_options 11124+#undef MS_SIMD_SSE 11125+#ifdef __cplusplus 11126+} 11127+#endif 11128+#endif 11129diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h 11130new file mode 100644 11131index 00000000..0a1d21c2 11132--- /dev/null 11133+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h 11134@@ -0,0 +1,129 @@ 11135+/** 11136+ * Copyright 2022 Huawei Technologies Co., Ltd 11137+ * 11138+ * Licensed under the Apache License, Version 2.0 (the "License"); 11139+ * you may not use this file except in compliance with the License. 11140+ * You may obtain a copy of the License at 11141+ * 11142+ * http://www.apache.org/licenses/LICENSE-2.0 11143+ * 11144+ * Unless required by applicable law or agreed to in writing, software 11145+ * distributed under the License is distributed on an "AS IS" BASIS, 11146+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11147+ * See the License for the specific language governing permissions and 11148+ * limitations under the License. 11149+ */ 11150+ 11151+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_ 11152+#define MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_ 11153+ 11154+#include "nnacl/intrinsics/ms_simd_instructions.h" 11155+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11156+ 11157+#ifdef __cplusplus 11158+extern "C" { 11159+#endif 11160+#pragma GCC push_options 11161+#pragma GCC target("sse4.1") 11162+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11163+#define BLOCK_NUM 4 11164+#define MS_SIMD_SSE 11165+ 11166+#if defined(MS_SIMD_AVX512) 11167+// only avx512 support abs fp32 instruction 11168+static inline int ElementAbsSSE(int index, const float *input, float *output, const int element_size) { 11169+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11170+ SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index))); 11171+ } 11172+ return index; 11173+} 11174+ 11175+static inline int ElementAbsIntSSE(int index, const int *input, int *output, const int element_size) { 11176+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11177+ SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index))); 11178+ } 11179+ return index; 11180+} 11181+#endif 11182+ 11183+static inline int ElementSquareSSE(int index, const float *input, float *output, const int element_size) { 11184+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11185+ SIMD_F32 vin = SIMD_LD_F32(input + index); 11186+ SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin)); 11187+ } 11188+ return index; 11189+} 11190+ 11191+static inline int ElementSqrtSSE(int index, const float *input, float *output, const int element_size) { 11192+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 
11193+ SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index))); 11194+ } 11195+ return index; 11196+} 11197+ 11198+static inline int ElementRsqrtSSE(int index, const float *input, float *output, const int element_size) { 11199+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11200+ SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index))); 11201+ } 11202+ return index; 11203+} 11204+ 11205+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE) 11206+// avx512 dont support round fp32 instruction 11207+static inline int ElementRoundSSE(int index, const float *input, float *output, const int element_size) { 11208+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11209+ SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index))); 11210+ } 11211+ return index; 11212+} 11213+#endif 11214+ 11215+#ifndef MS_SIMD_NEON 11216+// neon dont support floor fp32 instruction 11217+static inline int ElementFloorSSE(int index, const float *input, float *output, const int element_size) { 11218+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11219+ SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index))); 11220+ } 11221+ return index; 11222+} 11223+#endif 11224+ 11225+#ifndef MS_SIMD_NEON 11226+static inline int ElementCeilSSE(int index, const float *input, float *output, const int element_size) { 11227+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11228+ SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index))); 11229+ } 11230+ return index; 11231+} 11232+#endif 11233+ 11234+static inline int ElementNegativeSSE(int index, const float *input, float *output, const int element_size) { 11235+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11236+ 
SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f)); 11237+ } 11238+ return index; 11239+} 11240+ 11241+static inline int ElementNegativeIntSSE(int index, const int *input, int *output, const int element_size) { 11242+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11243+ SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1)); 11244+ } 11245+ return index; 11246+} 11247+ 11248+static inline int ElementReciprocalSSE(int index, const float *input, float *output, const int element_size) { 11249+ SIMD_F32 num1 = SIMD_MOV_F32(1.0f); 11250+ for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11251+ SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index))); 11252+ } 11253+ return index; 11254+} 11255+ 11256+#undef MS_SIMD_INSTRUCTION 11257+#undef BLOCK_NUM 11258+#pragma GCC pop_options 11259+#undef MS_SIMD_SSE 11260+#ifdef __cplusplus 11261+} 11262+#endif 11263+#endif 11264diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h 11265new file mode 100644 11266index 00000000..f04b4e1f 11267--- /dev/null 11268+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h 11269@@ -0,0 +1,67 @@ 11270+/** 11271+ * Copyright 2022 Huawei Technologies Co., Ltd 11272+ * 11273+ * Licensed under the Apache License, Version 2.0 (the "License"); 11274+ * you may not use this file except in compliance with the License. 
11275+ * You may obtain a copy of the License at 11276+ * 11277+ * http://www.apache.org/licenses/LICENSE-2.0 11278+ * 11279+ * Unless required by applicable law or agreed to in writing, software 11280+ * distributed under the License is distributed on an "AS IS" BASIS, 11281+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11282+ * See the License for the specific language governing permissions and 11283+ * limitations under the License. 11284+ */ 11285+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_ 11286+#define MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_ 11287+ 11288+#include "nnacl/intrinsics/ms_simd_instructions.h" 11289+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11290+ 11291+#ifdef __cplusplus 11292+extern "C" { 11293+#endif 11294+#pragma GCC push_options 11295+#pragma GCC target("sse4.1") 11296+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11297+#define BLOCK_NUM 4 11298+#define MS_SIMD_SSE 11299+ 11300+static inline int BatchNormFp32SSE(int index, const float *input, const float *mean, 11301+ const float *variance, int channel, float epsilon, float *output) { 11302+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11303+ SIMD_F32 input_data = SIMD_LD_F32(input + index); 11304+ SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 11305+ SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 11306+ SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 11307+ SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 11308+ SIMD_ST_F32(output + index, output_data); 11309+ } 11310+ return index; 11311+} 11312+ 11313+static inline int FusedBatchNormFp32SSE(int index, const float *input, const float *scale, 11314+ const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) { 11315+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11316+
SIMD_F32 input_data = SIMD_LD_F32(input + index); 11317+ SIMD_F32 scale_ = SIMD_LD_F32(scale + index); 11318+ SIMD_F32 offset_ = SIMD_LD_F32(offset + index); 11319+ SIMD_F32 mean_ = SIMD_LD_F32(mean + index); 11320+ SIMD_F32 variance_ = SIMD_LD_F32(variance + index); 11321+ SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon))); 11322+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt); 11323+ SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_); 11324+ SIMD_ST_F32(output + index, output_data); 11325+ } 11326+ return index; 11327+} 11328+ 11329+#undef MS_SIMD_INSTRUCTION 11330+#undef BLOCK_NUM 11331+#pragma GCC pop_options 11332+#undef MS_SIMD_SSE 11333+#ifdef __cplusplus 11334+} 11335+#endif 11336+#endif 11337diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h 11338new file mode 100644 11339index 00000000..c929ccaf 11340--- /dev/null 11341+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h 11342@@ -0,0 +1,69 @@ 11343+/** 11344+ * Copyright 2022 Huawei Technologies Co., Ltd 11345+ * 11346+ * Licensed under the Apache License, Version 2.0 (the "License"); 11347+ * you may not use this file except in compliance with the License. 11348+ * You may obtain a copy of the License at 11349+ * 11350+ * http://www.apache.org/licenses/LICENSE-2.0 11351+ * 11352+ * Unless required by applicable law or agreed to in writing, software 11353+ * distributed under the License is distributed on an "AS IS" BASIS, 11354+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11355+ * See the License for the specific language governing permissions and 11356+ * limitations under the License. 
11357+ */ 11358+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_ 11359+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_ 11360+ 11361+#include "nnacl/intrinsics/ms_simd_instructions.h" 11362+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11363+ 11364+#ifdef __cplusplus 11365+extern "C" { 11366+#endif 11367+#pragma GCC push_options 11368+#pragma GCC target("sse4.1") 11369+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11370+#define BLOCK_NUM 4 11371+#define MS_SIMD_SSE 11372+ 11373+static inline int BCEWithLogitLossSSE(int index, const float *logits, const float *label, 11374+ const float *weight, const float *pos_weight, int length, bool reduction, float *output, 11375+ float *reduction_sum) { 11376+ SIMD_F32 zero = SIMD_SET0_F32; 11377+ SIMD_F32 ones = SIMD_MOV_F32(1.0f); 11378+ SIMD_F32 middle_output = SIMD_SET0_F32; 11379+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11380+ SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); 11381+ SIMD_F32 label_tmp = SIMD_LD_F32(label + index); 11382+ SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); 11383+ SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); 11384+ SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); 11385+ SIMD_F32 max_value = neg_logits_tmp; 11386+ max_value = SIMD_MIN_F32(max_value, zero); 11387+ SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); 11388+ SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); 11389+ SIMD_F32 log_exp_value = 11390+ SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); 11391+ SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), 11392+ SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); 11393+ if (reduction) { 11394+ middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); 11395+ } else { 11396+ SIMD_ST_F32(output + index, 
SIMD_MUL_F32(loss, weight_tmp)); 11397+ } 11398+ } 11399+ if (reduction) { 11400+ *reduction_sum += SIMD_GET_SUM_F32(middle_output); 11401+ } 11402+ return index; 11403+} 11404+#undef MS_SIMD_INSTRUCTION 11405+#undef BLOCK_NUM 11406+#pragma GCC pop_options 11407+#undef MS_SIMD_SSE 11408+#ifdef __cplusplus 11409+} 11410+#endif 11411+#endif 11412diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h 11413new file mode 100644 11414index 00000000..0544d239 11415--- /dev/null 11416+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h 11417@@ -0,0 +1,64 @@ 11418+/** 11419+ * Copyright 2022 Huawei Technologies Co., Ltd 11420+ * 11421+ * Licensed under the Apache License, Version 2.0 (the "License"); 11422+ * you may not use this file except in compliance with the License. 11423+ * You may obtain a copy of the License at 11424+ * 11425+ * http://www.apache.org/licenses/LICENSE-2.0 11426+ * 11427+ * Unless required by applicable law or agreed to in writing, software 11428+ * distributed under the License is distributed on an "AS IS" BASIS, 11429+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11430+ * See the License for the specific language governing permissions and 11431+ * limitations under the License. 
11432+ */ 11433+ 11434+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_ 11435+#define MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_ 11436+ 11437+#include "nnacl/intrinsics/ms_simd_instructions.h" 11438+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11439+ 11440+#ifdef __cplusplus 11441+extern "C" { 11442+#endif 11443+#pragma GCC push_options 11444+#pragma GCC target("sse4.1") 11445+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11446+#define BLOCK_NUM 4 11447+#define MS_SIMD_SSE 11448+ 11449+static inline int BiasAddByInnerCoreSSE(int index, const float *input, const float *bias, float *output, 11450+ int64_t num) { 11451+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11452+ SIMD_F32 vin0 = SIMD_LD_F32(input + index); 11453+ SIMD_F32 vin1 = SIMD_LD_F32(bias + index); 11454+ SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); 11455+ SIMD_ST_F32(output + index, vout); 11456+ } 11457+ return index; 11458+} 11459+ 11460+static inline int BiasAddByBatchCoreSSE(int index, const float *input, const float *bias, float *output1, 11461+ float *output2, float *output3, float *output4, int64_t num) { 11462+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11463+ SIMD_LDX4_F32(input_data, input + index, num); 11464+ SIMD_F32 bias_data = SIMD_LD_F32(bias + index); 11465+ SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data)); 11466+ SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data)); 11467+ SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data)); 11468+ SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data)); 11469+ } 11470+ return index; 11471+} 11472+ 11473+#undef MS_SIMD_INSTRUCTION 11474+#undef BLOCK_NUM 11475+#pragma GCC pop_options 11476+#undef MS_SIMD_SSE 11477+#ifdef __cplusplus 11478+} 11479+#endif 11480+ 11481+#endif // MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_ 11482diff --git
a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h 11483new file mode 100644 11484index 00000000..4eca209f 11485--- /dev/null 11486+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h 11487@@ -0,0 +1,56 @@ 11488+/** 11489+ * Copyright 2022 Huawei Technologies Co., Ltd 11490+ * 11491+ * Licensed under the Apache License, Version 2.0 (the "License"); 11492+ * you may not use this file except in compliance with the License. 11493+ * You may obtain a copy of the License at 11494+ * 11495+ * http://www.apache.org/licenses/LICENSE-2.0 11496+ * 11497+ * Unless required by applicable law or agreed to in writing, software 11498+ * distributed under the License is distributed on an "AS IS" BASIS, 11499+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11500+ * See the License for the specific language governing permissions and 11501+ * limitations under the License. 
11502+ */ 11503+#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_ 11504+#define MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_ 11505+ 11506+#include "nnacl/intrinsics/ms_simd_instructions.h" 11507+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11508+ 11509+#ifdef __cplusplus 11510+extern "C" { 11511+#endif 11512+#pragma GCC push_options 11513+#pragma GCC target("sse4.1") 11514+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11515+#define BLOCK_NUM 4 11516+#define MS_SIMD_SSE 11517+ 11518+static inline int Int32ToFloat32SSE(int index, const int32_t *input, float *output, int number) { 11519+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11520+ SIMD_EPI32 value = SIMD_LD_EPI32(input + index); 11521+ SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); 11522+ } 11523+ return index; 11524+} 11525+ 11526+#ifndef MS_SIMD_NEON 11527+static inline int Float32ToInt32SSE(int index, const float *input, int32_t *output, int number) { 11528+ for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11529+ SIMD_F32 value = SIMD_LD_F32(input + index); 11530+ SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); 11531+ } 11532+ return index; 11533+} 11534+#endif 11535+ 11536+#undef MS_SIMD_INSTRUCTION 11537+#undef BLOCK_NUM 11538+#pragma GCC pop_options 11539+#undef MS_SIMD_SSE 11540+#ifdef __cplusplus 11541+} 11542+#endif 11543+#endif 11544diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h 11545new file mode 100644 11546index 00000000..3d116113 11547--- /dev/null 11548+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h 11549@@ -0,0 +1,70 @@ 11550+/** 11551+ * Copyright 2022 Huawei Technologies Co., Ltd 11552+ * 11553+ * Licensed under the Apache License, Version 2.0 (the "License"); 11554+ * you may not use this file except in 
compliance with the License. 11555+ * You may obtain a copy of the License at 11556+ * 11557+ * http://www.apache.org/licenses/LICENSE-2.0 11558+ * 11559+ * Unless required by applicable law or agreed to in writing, software 11560+ * distributed under the License is distributed on an "AS IS" BASIS, 11561+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11562+ * See the License for the specific language governing permissions and 11563+ * limitations under the License. 11564+ */ 11565+#ifndef MINDSPORE_NNACL_FP32_CDIST_SSE_H_ 11566+#define MINDSPORE_NNACL_FP32_CDIST_SSE_H_ 11567+ 11568+#include "nnacl/intrinsics/ms_simd_instructions.h" 11569+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11570+ 11571+#ifdef __cplusplus 11572+extern "C" { 11573+#endif 11574+#pragma GCC push_options 11575+#pragma GCC target("sse4.1") 11576+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11577+#define BLOCK_NUM 4 11578+#define MS_SIMD_SSE 11579+ 11580+static inline int64_t CdistTwoNormalOptSSE(int64_t index, const float *a, const float *b, 11581+ float *out, int64_t size) { 11582+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 11583+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11584+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 11585+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 11586+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 11587+ tmp_vec = SIMD_ABS_F32(tmp_vec); 11588+ result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); 11589+ } 11590+ *out += SIMD_GET_SUM_F32(result_vec); 11591+ 11592+ return index; 11593+} 11594+ 11595+static inline int64_t CdistPNormalOptSSE(int64_t index, const float *a, const float *b, 11596+ float *out, int64_t size, float p) { 11597+ SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); 11598+ SIMD_F32 p_vec = SIMD_MOV_F32(p); 11599+ for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11600+ SIMD_F32 a_vec = SIMD_LD_F32(a + index); 
11601+ SIMD_F32 b_vec = SIMD_LD_F32(b + index); 11602+ SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); 11603+ tmp_vec = SIMD_ABS_F32(tmp_vec); 11604+ tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); 11605+ result_vec = SIMD_ADD_F32(tmp_vec, result_vec); 11606+ } 11607+ *out += SIMD_GET_SUM_F32(result_vec); 11608+ 11609+ return index; 11610+} 11611+ 11612+#undef MS_SIMD_INSTRUCTION 11613+#undef BLOCK_NUM 11614+#pragma GCC pop_options 11615+#undef MS_SIMD_SSE 11616+#ifdef __cplusplus 11617+} 11618+#endif 11619+#endif 11620diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h 11621new file mode 100644 11622index 00000000..1b67143f 11623--- /dev/null 11624+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h 11625@@ -0,0 +1,121 @@ 11626+/** 11627+ * Copyright 2022 Huawei Technologies Co., Ltd 11628+ * 11629+ * Licensed under the Apache License, Version 2.0 (the "License"); 11630+ * you may not use this file except in compliance with the License. 11631+ * You may obtain a copy of the License at 11632+ * 11633+ * http://www.apache.org/licenses/LICENSE-2.0 11634+ * 11635+ * Unless required by applicable law or agreed to in writing, software 11636+ * distributed under the License is distributed on an "AS IS" BASIS, 11637+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11638+ * See the License for the specific language governing permissions and 11639+ * limitations under the License. 
11640+ */ 11641+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_ 11642+#define MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_ 11643+ 11644+#include "nnacl/intrinsics/ms_simd_instructions.h" 11645+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11646+ 11647+#ifdef __cplusplus 11648+extern "C" { 11649+#endif 11650+#pragma GCC push_options 11651+#pragma GCC target("sse4.1") 11652+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11653+#define BLOCK_NUM 4 11654+#define MS_SIMD_SSE 11655+ 11656+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 11657+// (a, b, c) -> (0, a, a+b) exclusive == true 11658+static inline int64_t CumsumOutputInitWithInputSSE(int64_t index, const float *layer_input, 11659+ float *layer_output, int inner_dim) { 11660+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11661+ SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index)); 11662+ } 11663+ return index; 11664+} 11665+ 11666+static inline int64_t CumsumOutputInitWithZeroSSE(int64_t index, float *layer_output, int inner_dim) { 11667+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11668+ SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f)); 11669+ } 11670+ return index; 11671+} 11672+ 11673+static inline int64_t CumsumSSE(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output, 11674+ int inner_dim) { 11675+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11676+ SIMD_F32 input_val = SIMD_LD_F32(layer_input + index); 11677+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index); 11678+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 11679+ SIMD_ST_F32(layer_output + index, out_val); 11680+ } 11681+ return index; 11682+} 11683+ 11684+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false 11685+// (a, b, c) -> (c+b, c, 0) exclusive==true 11686+static inline int64_t CumsumReverseSSE(int64_t 
index, const float *layer_input, float *layer_output, 11687+ float *layer_last_output, int inner_dim) { 11688+ 11689+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11690+ SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1); 11691+ SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1); 11692+ SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val); 11693+ SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val); 11694+ } 11695+ return index; 11696+} 11697+ 11698+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false 11699+// (a, b, c) -> (0, a, a+b) exclusive == true 11700+static inline int64_t CumsumIntOutputInitWithInputSSE(int64_t index, const int *layer_input, 11701+ int *layer_output, int inner_dim) { 11702+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11703+ SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index)); 11704+ } 11705+ return index; 11706+} 11707+ 11708+static inline int64_t CumsumIntOutputInitWithZeroSSE(int64_t index, int *layer_output, int inner_dim) { 11709+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11710+ SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0)); 11711+ } 11712+ return index; 11713+} 11714+ 11715+static inline int64_t CumsumIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 11716+ int inner_dim) { 11717+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11718+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index); 11719+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index); 11720+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 11721+ SIMD_ST_EPI32(layer_output + index, out_val); 11722+ } 11723+ return index; 11724+} 11725+ 11726+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
11727+// (a, b, c) -> (c+b, c, 0) exclusive==true 11728+static inline int64_t CumsumReverseIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, 11729+ int inner_dim) { 11730+ for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11731+ SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); 11732+ SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); 11733+ SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); 11734+ SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); 11735+ } 11736+ return index; 11737+} 11738+ 11739+#undef MS_SIMD_INSTRUCTION 11740+#undef BLOCK_NUM 11741+#pragma GCC pop_options 11742+#undef MS_SIMD_SSE 11743+#ifdef __cplusplus 11744+} 11745+#endif 11746+#endif 11747diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h 11748new file mode 100644 11749index 00000000..5f0c6009 11750--- /dev/null 11751+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h 11752@@ -0,0 +1,167 @@ 11753+/** 11754+ * Copyright 2022 Huawei Technologies Co., Ltd 11755+ * 11756+ * Licensed under the Apache License, Version 2.0 (the "License"); 11757+ * you may not use this file except in compliance with the License. 11758+ * You may obtain a copy of the License at 11759+ * 11760+ * http://www.apache.org/licenses/LICENSE-2.0 11761+ * 11762+ * Unless required by applicable law or agreed to in writing, software 11763+ * distributed under the License is distributed on an "AS IS" BASIS, 11764+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11765+ * See the License for the specific language governing permissions and 11766+ * limitations under the License. 
11767+ */ 11768+ 11769+#ifndef MINDSPORE_NNACL_FP32_DIV_SSE_H_ 11770+#define MINDSPORE_NNACL_FP32_DIV_SSE_H_ 11771+ 11772+#include "nnacl/intrinsics/ms_simd_instructions.h" 11773+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11774+ 11775+#ifdef __cplusplus 11776+extern "C" { 11777+#endif 11778+#pragma GCC push_options 11779+#pragma GCC target("sse4.1") 11780+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11781+#define BLOCK_NUM 4 11782+#define MS_SIMD_SSE 11783+ 11784+static inline int ElementOptDivNum0SSE(int index, const float *in0, const float *in1, float *out, 11785+ int size) { 11786+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 11787+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11788+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 11789+ SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); 11790+ SIMD_ST_F32(out + index, vout); 11791+ } 11792+ return index; 11793+} 11794+ 11795+static inline int ElementOptDivNum1SSE(int index, const float *in0, const float *in1, float *out, 11796+ int size) { 11797+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 11798+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11799+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 11800+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); 11801+ SIMD_ST_F32(out + index, vout); 11802+ } 11803+ return index; 11804+} 11805+ 11806+static inline int ElementOptDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 11807+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 11808+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11809+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 11810+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); 11811+ SIMD_ST_EPI32(out + index, vout); 11812+ } 11813+ return index; 11814+} 11815+ 11816+static inline int ElementOptDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 11817+ SIMD_EPI32 
vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 11818+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11819+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 11820+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); 11821+ SIMD_ST_EPI32(out + index, vout); 11822+ } 11823+ return index; 11824+} 11825+ 11826+static inline int ElementOptDivReluNum0SSE(int index, const float *in0, const float *in1, float *out, 11827+ int size) { 11828+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 11829+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11830+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 11831+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); 11832+ SIMD_ST_F32(out + index, vout); 11833+ } 11834+ return index; 11835+} 11836+ 11837+static inline int ElementOptDivReluNum1SSE(int index, const float *in0, const float *in1, float *out, 11838+ int size) { 11839+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 11840+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11841+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 11842+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); 11843+ SIMD_ST_F32(out + index, vout); 11844+ } 11845+ return index; 11846+} 11847+ 11848+static inline int ElementOptDivRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, 11849+ int size) { 11850+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 11851+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11852+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 11853+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); 11854+ SIMD_ST_F32(out + index, vout); 11855+ } 11856+ return index; 11857+} 11858+ 11859+static inline int ElementOptDivRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, 11860+ int size) { 11861+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 11862+ 
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11863+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 11864+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); 11865+ SIMD_ST_F32(out + index, vout); 11866+ } 11867+ return index; 11868+} 11869+ 11870+static inline int ElementDivSSE(int index, const float *in0, const float *in1, float *out, int size) { 11871+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11872+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 11873+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 11874+ SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); 11875+ SIMD_ST_F32(out + index, vout); 11876+ } 11877+ return index; 11878+} 11879+ 11880+static inline int ElementDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 11881+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11882+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 11883+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 11884+ SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); 11885+ SIMD_ST_EPI32(out + index, vout); 11886+ } 11887+ return index; 11888+} 11889+ 11890+static inline int ElementDivReluSSE(int index, const float *in0, const float *in1, float *out, 11891+ int size) { 11892+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11893+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 11894+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 11895+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); 11896+ SIMD_ST_F32(out + index, vout); 11897+ } 11898+ return index; 11899+} 11900+ 11901+static inline int ElementDivRelu6SSE(int index, const float *in0, const float *in1, float *out, 11902+ int size) { 11903+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11904+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 11905+ SIMD_F32 vin1 = 
SIMD_LD_F32(in1 + index); 11906+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f); 11907+ SIMD_ST_F32(out + index, vout); 11908+ } 11909+ return index; 11910+} 11911+ 11912+#undef MS_SIMD_INSTRUCTION 11913+#undef BLOCK_NUM 11914+#pragma GCC pop_options 11915+#undef MS_SIMD_SSE 11916+#ifdef __cplusplus 11917+}; 11918+#endif 11919+#endif 11920diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h 11921new file mode 100644 11922index 00000000..2429ed38 11923--- /dev/null 11924+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h 11925@@ -0,0 +1,46 @@ 11926+/** 11927+ * Copyright 2022 Huawei Technologies Co., Ltd 11928+ * 11929+ * Licensed under the Apache License, Version 2.0 (the "License"); 11930+ * you may not use this file except in compliance with the License. 11931+ * You may obtain a copy of the License at 11932+ * 11933+ * http://www.apache.org/licenses/LICENSE-2.0 11934+ * 11935+ * Unless required by applicable law or agreed to in writing, software 11936+ * distributed under the License is distributed on an "AS IS" BASIS, 11937+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11938+ * See the License for the specific language governing permissions and 11939+ * limitations under the License. 
11940+ */ 11941+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_ 11942+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_ 11943+ 11944+#include "nnacl/intrinsics/ms_simd_instructions.h" 11945+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11946+ 11947+#ifdef __cplusplus 11948+extern "C" { 11949+#endif 11950+#pragma GCC push_options 11951+#pragma GCC target("sse4.1") 11952+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 11953+#define BLOCK_NUM 4 11954+#define MS_SIMD_SSE 11955+ 11956+static inline int DropoutFp32SSE(int index, const float *input, float scale, 11957+ int length, float *output) { 11958+ SIMD_F32 scale_value = SIMD_MOV_F32(scale); 11959+ for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 11960+ SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); 11961+ } 11962+ return index; 11963+} 11964+#undef MS_SIMD_INSTRUCTION 11965+#undef BLOCK_NUM 11966+#pragma GCC pop_options 11967+#undef MS_SIMD_SSE 11968+#ifdef __cplusplus 11969+} 11970+#endif 11971+#endif 11972diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h 11973new file mode 100644 11974index 00000000..3d802fb3 11975--- /dev/null 11976+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h 11977@@ -0,0 +1,63 @@ 11978+/** 11979+ * Copyright 2022 Huawei Technologies Co., Ltd 11980+ * 11981+ * Licensed under the Apache License, Version 2.0 (the "License"); 11982+ * you may not use this file except in compliance with the License. 
11983+ * You may obtain a copy of the License at 11984+ * 11985+ * http://www.apache.org/licenses/LICENSE-2.0 11986+ * 11987+ * Unless required by applicable law or agreed to in writing, software 11988+ * distributed under the License is distributed on an "AS IS" BASIS, 11989+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11990+ * See the License for the specific language governing permissions and 11991+ * limitations under the License. 11992+ */ 11993+ 11994+#ifndef MINDSPORE_NNACL_FP32_EXP_SSE_H_ 11995+#define MINDSPORE_NNACL_FP32_EXP_SSE_H_ 11996+ 11997+#include "nnacl/intrinsics/ms_simd_instructions.h" 11998+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 11999+ 12000+#ifdef __cplusplus 12001+extern "C" { 12002+#endif 12003+#pragma GCC push_options 12004+#pragma GCC target("sse4.1") 12005+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12006+#define BLOCK_NUM 4 12007+#define MS_SIMD_SSE 12008+ 12009+static inline int64_t ExpFp32SSE(int64_t index, const float *src, float *dst, int num) { 12010+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12011+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index); 12012+ } 12013+ return index; 12014+} 12015+ 12016+static inline int64_t ExpFp32WithInScaleSSE(int64_t index, const float *src, float *dst, int num, float in_scale) { 12017+ SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale); 12018+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12019+ SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index); 12020+ } 12021+ return index; 12022+} 12023+ 12024+static inline int64_t ExpFp32WithOutScaleSSE(int64_t index, const float *src, float *dst, int num, float out_scale) { 12025+ SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale); 12026+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12027+ SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst +
index); 12028+ SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec)); 12029+ } 12030+ return index; 12031+} 12032+ 12033+#undef MS_SIMD_INSTRUCTION 12034+#undef BLOCK_NUM 12035+#pragma GCC pop_options 12036+#undef MS_SIMD_SSE 12037+#ifdef __cplusplus 12038+}; 12039+#endif 12040+#endif 12041diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h 12042new file mode 100644 12043index 00000000..9c71eefb 12044--- /dev/null 12045+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h 12046@@ -0,0 +1,53 @@ 12047+/** 12048+ * Copyright 2022 Huawei Technologies Co., Ltd 12049+ * 12050+ * Licensed under the Apache License, Version 2.0 (the "License"); 12051+ * you may not use this file except in compliance with the License. 12052+ * You may obtain a copy of the License at 12053+ * 12054+ * http://www.apache.org/licenses/LICENSE-2.0 12055+ * 12056+ * Unless required by applicable law or agreed to in writing, software 12057+ * distributed under the License is distributed on an "AS IS" BASIS, 12058+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12059+ * See the License for the specific language governing permissions and 12060+ * limitations under the License. 
12061+ */ 12062+#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_ 12063+#define MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_ 12064+ 12065+#include "nnacl/intrinsics/ms_simd_instructions.h" 12066+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12067+ 12068+#ifdef __cplusplus 12069+extern "C" { 12070+#endif 12071+#pragma GCC push_options 12072+#pragma GCC target("sse4.1") 12073+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12074+#define BLOCK_NUM 4 12075+#define MS_SIMD_SSE 12076+ 12077+static inline int FillFp32SSE(int index, float *output, int size, float data) { 12078+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12079+ SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); 12080+ } 12081+ return index; 12082+} 12083+ 12084+static inline int FillInt32SSE(int index, int *output, int size, int data) { 12085+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12086+ SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); 12087+ } 12088+ return index; 12089+} 12090+ 12091+#undef MS_SIMD_INSTRUCTION 12092+#undef BLOCK_NUM 12093+#pragma GCC pop_options 12094+#undef MS_SIMD_SSE 12095+#ifdef __cplusplus 12096+} 12097+#endif 12098+#endif 12099+ 12100diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h 12101new file mode 100644 12102index 00000000..1c1f57da 12103--- /dev/null 12104+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h 12105@@ -0,0 +1,77 @@ 12106+/** 12107+ * Copyright 2022 Huawei Technologies Co., Ltd 12108+ * 12109+ * Licensed under the Apache License, Version 2.0 (the "License"); 12110+ * you may not use this file except in compliance with the License. 
12111+ * You may obtain a copy of the License at 12112+ * 12113+ * http://www.apache.org/licenses/LICENSE-2.0 12114+ * 12115+ * Unless required by applicable law or agreed to in writing, software 12116+ * distributed under the License is distributed on an "AS IS" BASIS, 12117+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12118+ * See the License for the specific language governing permissions and 12119+ * limitations under the License. 12120+ */ 12121+#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_ 12122+#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_ 12123+ 12124+#include "nnacl/intrinsics/ms_simd_instructions.h" 12125+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12126+ 12127+#ifdef __cplusplus 12128+extern "C" { 12129+#endif 12130+#pragma GCC push_options 12131+#pragma GCC target("sse4.1") 12132+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12133+#define BLOCK_NUM 4 12134+#define MS_SIMD_SSE 12135+ 12136+static inline int64_t GroupNormFp32SSE(int64_t index, const float *unit_input, float scale, float offset, float mean, 12137+ float var_sqrt, int unit, float *unit_output) { 12138+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 12139+ SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); 12140+ SIMD_F32 scale_val = SIMD_MOV_F32(scale); 12141+ SIMD_F32 offset_val = SIMD_MOV_F32(offset); 12142+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12143+ SIMD_F32 input = SIMD_LD_F32(unit_input + index); 12144+ SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); 12145+ SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); 12146+ SIMD_ST_F32(unit_output + index, output); 12147+ } 12148+ return index; 12149+} 12150+ 12151+static inline int64_t GroupNormReduceSumSSE(int64_t index, const float *in, float *sum, int unit) { 12152+ if (unit - index >= 4 * BLOCK_NUM) { 12153+ SIMD_F32 tmp = SIMD_MOV_F32(0); 12154+ for (int block_max_size = unit - BLOCK_NUM 
+ 1; index < block_max_size; index += BLOCK_NUM) { 12155+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); 12156+ } 12157+ *sum += SIMD_GET_SUM_F32(tmp); 12158+ } 12159+ return index; 12160+} 12161+ 12162+static inline int64_t GroupNormReduceVarSSE(int64_t index, const float *in, float mean, float *sum, int unit) { 12163+ if (unit - index >= 4 * BLOCK_NUM) { 12164+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 12165+ SIMD_F32 tmp = SIMD_MOV_F32(0); 12166+ for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12167+ SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); 12168+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); 12169+ } 12170+ *sum += SIMD_GET_SUM_F32(tmp); 12171+ } 12172+ return index; 12173+} 12174+ 12175+#undef MS_SIMD_INSTRUCTION 12176+#undef BLOCK_NUM 12177+#pragma GCC pop_options 12178+#undef MS_SIMD_SSE 12179+#ifdef __cplusplus 12180+} 12181+#endif 12182+#endif 12183diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h 12184new file mode 100644 12185index 00000000..30af87c3 12186--- /dev/null 12187+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h 12188@@ -0,0 +1,68 @@ 12189+/** 12190+ * Copyright 2022 Huawei Technologies Co., Ltd 12191+ * 12192+ * Licensed under the Apache License, Version 2.0 (the "License"); 12193+ * you may not use this file except in compliance with the License. 12194+ * You may obtain a copy of the License at 12195+ * 12196+ * http://www.apache.org/licenses/LICENSE-2.0 12197+ * 12198+ * Unless required by applicable law or agreed to in writing, software 12199+ * distributed under the License is distributed on an "AS IS" BASIS, 12200+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12201+ * See the License for the specific language governing permissions and 12202+ * limitations under the License. 12203+ */ 12204+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_ 12205+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_ 12206+ 12207+#include "nnacl/intrinsics/ms_simd_instructions.h" 12208+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12209+ 12210+#ifdef __cplusplus 12211+extern "C" { 12212+#endif 12213+#pragma GCC push_options 12214+#pragma GCC target("sse4.1") 12215+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12216+#define BLOCK_NUM 4 12217+#define MS_SIMD_SSE 12218+ 12219+static inline int LayerNormMeanAndSquareSSE(int index, const float *src, int num, float *mean, float *square_mean) { 12220+ if (num >= 4 * BLOCK_NUM) { 12221+ SIMD_F32 sum_val = SIMD_SET0_F32; 12222+ SIMD_F32 square_sum_val = SIMD_SET0_F32; 12223+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12224+ SIMD_F32 value = SIMD_LD_F32(src + index); 12225+ SIMD_F32 square_value = SIMD_MUL_F32(value, value); 12226+ sum_val = SIMD_ADD_F32(sum_val, value); 12227+ square_sum_val = SIMD_ADD_F32(square_sum_val, square_value); 12228+ } 12229+ *mean += SIMD_GET_SUM_F32(sum_val); 12230+ *square_mean += SIMD_GET_SUM_F32(square_sum_val); 12231+ } 12232+ return index; 12233+} 12234+ 12235+static inline int LayerNormGammaAndBetaSSE(int index, float *dst, const float *src, const float *gamma_data, 12236+ const float *beta_data, int num, const float mean, const float deno) { 12237+ SIMD_F32 mean_val = SIMD_MOV_F32(mean); 12238+ SIMD_F32 deno_val = SIMD_MOV_F32(deno); 12239+ for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12240+ SIMD_F32 value = SIMD_LD_F32(src + index); 12241+ SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val); 12242+ out_value = SIMD_MUL_F32(out_value, deno_val); 12243+ out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + 
index)); 12244+ SIMD_ST_F32(dst + index, out_value); 12245+ } 12246+ return index; 12247+} 12248+ 12249+#undef MS_SIMD_INSTRUCTION 12250+#undef BLOCK_NUM 12251+#pragma GCC pop_options 12252+#undef MS_SIMD_SSE 12253+#ifdef __cplusplus 12254+} 12255+#endif 12256+#endif 12257diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h 12258new file mode 100644 12259index 00000000..aef5b2a1 12260--- /dev/null 12261+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h 12262@@ -0,0 +1,93 @@ 12263+/** 12264+ * Copyright 2022 Huawei Technologies Co., Ltd 12265+ * 12266+ * Licensed under the Apache License, Version 2.0 (the "License"); 12267+ * you may not use this file except in compliance with the License. 12268+ * You may obtain a copy of the License at 12269+ * 12270+ * http://www.apache.org/licenses/LICENSE-2.0 12271+ * 12272+ * Unless required by applicable law or agreed to in writing, software 12273+ * distributed under the License is distributed on an "AS IS" BASIS, 12274+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12275+ * See the License for the specific language governing permissions and 12276+ * limitations under the License. 12277+ */ 12278+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_ 12279+#define MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_ 12280+ 12281+#include "nnacl/intrinsics/ms_simd_instructions.h" 12282+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12283+ 12284+#ifdef __cplusplus 12285+extern "C" { 12286+#endif 12287+#pragma GCC push_options 12288+#pragma GCC target("sse4.1") 12289+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12290+#define BLOCK_NUM 4 12291+#define MS_SIMD_SSE 12292+ 12293+// act_type must be 0, 1, 2. 0: no_act, 1: relu, 3: relu6. 
12294+static inline int64_t GemmIsNotPackSSE(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, 12295+ int deep, int act_type) { 12296+ SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); 12297+ SIMD_F32 up_threshold = SIMD_MOV_F32(6); 12298+ SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); 12299+ SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); 12300+ for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12301+ SIMD_F32 a_data = SIMD_LD_F32(a + index); 12302+ SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); 12303+ if (act_type != 0) { 12304+ dst = SIMD_MAX_F32(dst, down_threshold); 12305+ if (act_type == 3) { 12306+ dst = SIMD_MIN_F32(dst, up_threshold); 12307+ } 12308+ } 12309+ SIMD_ST_F32(c + index, dst); 12310+ } 12311+ 12312+ return index; 12313+} 12314+ 12315+#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) 12316+static inline int64_t GemmIsNotPackOptimizeCoreSSE(int64_t index, const float *a, const float *b, int k, float *dst) { 12317+ SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); 12318+ for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12319+ SIMD_F32 weight = SIMD_LD_F32(b + index); 12320+ SIMD_F32 a1 = SIMD_LD_F32(a + index); 12321+ dst1 = SIMD_FMADD_F32(weight, a1, dst1); 12322+ } 12323+ *dst += SIMD_REDUCE_ADD_F32(dst1); 12324+ return index; 12325+} 12326+#endif 12327+ 12328+static inline int64_t MatVecMulNoPackCoreSSE(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, 12329+ int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { 12330+ for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { 12331+ SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? 
SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); 12332+ for (int64_t k = 0; k < depth; ++k) { 12333+ SIMD_F32 left = SIMD_MOV_F32(a[k]); 12334+ SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); 12335+ out = SIMD_FMADD_F32(left, right, out); 12336+ } 12337+ if ((inc_flag & 0x2) != 0 && act_type != 0) { 12338+ out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); 12339+ if (act_type == 0x3) { 12340+ out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); 12341+ } 12342+ } 12343+ SIMD_ST_F32(c + oc_index, out); 12344+ } 12345+ return oc_index; 12346+} 12347+ 12348+#undef MS_SIMD_INSTRUCTION 12349+#undef BLOCK_NUM 12350+#pragma GCC pop_options 12351+#undef MS_SIMD_SSE 12352+#ifdef __cplusplus 12353+} 12354+#endif 12355+#endif 12356diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h 12357new file mode 100644 12358index 00000000..e3dd4582 12359--- /dev/null 12360+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h 12361@@ -0,0 +1,218 @@ 12362+/** 12363+ * Copyright 2022 Huawei Technologies Co., Ltd 12364+ * 12365+ * Licensed under the Apache License, Version 2.0 (the "License"); 12366+ * you may not use this file except in compliance with the License. 12367+ * You may obtain a copy of the License at 12368+ * 12369+ * http://www.apache.org/licenses/LICENSE-2.0 12370+ * 12371+ * Unless required by applicable law or agreed to in writing, software 12372+ * distributed under the License is distributed on an "AS IS" BASIS, 12373+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12374+ * See the License for the specific language governing permissions and 12375+ * limitations under the License. 
12376+ */ 12377+#ifndef MINDSPORE_NNACL_FP32_MUL_SSE_H_ 12378+#define MINDSPORE_NNACL_FP32_MUL_SSE_H_ 12379+ 12380+#include "nnacl/intrinsics/ms_simd_instructions.h" 12381+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12382+ 12383+#ifdef __cplusplus 12384+extern "C" { 12385+#endif 12386+#pragma GCC push_options 12387+#pragma GCC target("sse4.1") 12388+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12389+#define BLOCK_NUM 4 12390+#define MS_SIMD_SSE 12391+ 12392+static inline int ElementMulSSE(int index, const float *in0, const float *in1, float *out, int size) { 12393+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12394+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 12395+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 12396+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1); 12397+ SIMD_ST_F32(out + index, vout); 12398+ } 12399+ return index; 12400+} 12401+ 12402+static inline int ElementMulReluSSE(int index, const float *in0, const float *in1, float *out, int size) { 12403+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12404+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 12405+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 12406+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f); 12407+ SIMD_ST_F32(out + index, vout); 12408+ } 12409+ return index; 12410+} 12411+ 12412+static inline int ElementMulRelu6SSE(int index, const float *in0, const float *in1, float *out, int size) { 12413+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12414+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 12415+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 12416+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f); 12417+ SIMD_ST_F32(out + index, vout); 12418+ } 12419+ return index; 12420+} 12421+ 12422+static inline int ElementMulIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 12423+ 
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12424+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 12425+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 12426+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1); 12427+ SIMD_ST_EPI32(out + index, vout); 12428+ } 12429+ return index; 12430+} 12431+ 12432+static inline int ElementMulReluIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 12433+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12434+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 12435+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 12436+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f); 12437+ SIMD_ST_EPI32(out + index, vout); 12438+ } 12439+ return index; 12440+} 12441+ 12442+static inline int ElementMulRelu6IntSSE(int index, const int *in0, const int *in1, int *out, int size) { 12443+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12444+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 12445+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 12446+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f); 12447+ SIMD_ST_EPI32(out + index, vout); 12448+ } 12449+ return index; 12450+} 12451+ 12452+static inline int ElementOptMulNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { 12453+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 12454+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12455+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 12456+ SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1); 12457+ SIMD_ST_F32(out + index, vout); 12458+ } 12459+ return index; 12460+} 12461+ 12462+static inline int ElementOptMulNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { 12463+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 12464+ for (int block_max_size = size - BLOCK_NUM + 1; 
index < block_max_size; index += BLOCK_NUM) { 12465+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 12466+ SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_); 12467+ SIMD_ST_F32(out + index, vout); 12468+ } 12469+ return index; 12470+} 12471+ 12472+static inline int ElementOptMulReluNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { 12473+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 12474+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12475+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 12476+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f); 12477+ SIMD_ST_F32(out + index, vout); 12478+ } 12479+ return index; 12480+} 12481+ 12482+static inline int ElementOptMulReluNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { 12483+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 12484+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12485+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 12486+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f); 12487+ SIMD_ST_F32(out + index, vout); 12488+ } 12489+ return index; 12490+} 12491+ 12492+static inline int ElementOptMulRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, int size) { 12493+ SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); 12494+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12495+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 12496+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f); 12497+ SIMD_ST_F32(out + index, vout); 12498+ } 12499+ return index; 12500+} 12501+ 12502+static inline int ElementOptMulRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, int size) { 12503+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 12504+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12505+ SIMD_F32 vin0 = 
SIMD_LD_F32(in0 + index); 12506+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f); 12507+ SIMD_ST_F32(out + index, vout); 12508+ } 12509+ return index; 12510+} 12511+ 12512+static inline int ElementOptMulIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 12513+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 12514+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12515+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 12516+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1); 12517+ SIMD_ST_EPI32(out + index, vout); 12518+ } 12519+ return index; 12520+} 12521+ 12522+static inline int ElementOptMulIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 12523+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 12524+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12525+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 12526+ SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_); 12527+ SIMD_ST_EPI32(out + index, vout); 12528+ } 12529+ return index; 12530+} 12531+ 12532+static inline int ElementOptMulReluIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 12533+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 12534+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12535+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 12536+ SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f); 12537+ SIMD_ST_EPI32(out + index, vout); 12538+ } 12539+ return index; 12540+} 12541+ 12542+static inline int ElementOptMulReluIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 12543+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 12544+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12545+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 12546+ SIMD_EPI32 vout = 
SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f); 12547+ SIMD_ST_EPI32(out + index, vout); 12548+ } 12549+ return index; 12550+} 12551+ 12552+static inline int ElementOptMulRelu6IntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 12553+ SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); 12554+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12555+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 12556+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f); 12557+ SIMD_ST_EPI32(out + index, vout); 12558+ } 12559+ return index; 12560+} 12561+ 12562+static inline int ElementOptMulRelu6IntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 12563+ SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 12564+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12565+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 12566+ SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f); 12567+ SIMD_ST_EPI32(out + index, vout); 12568+ } 12569+ return index; 12570+} 12571+ 12572+#undef MS_SIMD_INSTRUCTION 12573+#undef BLOCK_NUM 12574+#pragma GCC pop_options 12575+#undef MS_SIMD_SSE 12576+#ifdef __cplusplus 12577+} 12578+#endif 12579+#endif 12580diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h 12581new file mode 100644 12582index 00000000..ad9239fd 12583--- /dev/null 12584+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h 12585@@ -0,0 +1,84 @@ 12586+/** 12587+ * Copyright 2022 Huawei Technologies Co., Ltd 12588+ * 12589+ * Licensed under the Apache License, Version 2.0 (the "License"); 12590+ * you may not use this file except in compliance with the License. 
12591+ * You may obtain a copy of the License at 12592+ * 12593+ * http://www.apache.org/licenses/LICENSE-2.0 12594+ * 12595+ * Unless required by applicable law or agreed to in writing, software 12596+ * distributed under the License is distributed on an "AS IS" BASIS, 12597+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12598+ * See the License for the specific language governing permissions and 12599+ * limitations under the License. 12600+ */ 12601+#ifndef MINDSPORE_NNACL_FP32_POOLING_SSE_H_ 12602+#define MINDSPORE_NNACL_FP32_POOLING_SSE_H_ 12603+ 12604+#include "nnacl/intrinsics/ms_simd_instructions.h" 12605+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12606+ 12607+#ifdef __cplusplus 12608+extern "C" { 12609+#endif 12610+#pragma GCC push_options 12611+#pragma GCC target("sse4.1") 12612+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12613+#define BLOCK_NUM 4 12614+#define MS_SIMD_SSE 12615+ 12616+static inline int AvgPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel, 12617+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 12618+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 12619+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 12620+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 12621+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 12622+ const float *src_c_ptr = src_plane_ptr + ci; 12623+ float *dst_c_ptr = dst_plane_ptr + ci; 12624+ SIMD_F32 tmp_avg = SIMD_SET0_F32; 12625+ int real_count = 0; 12626+ for (int h = real_win_h_start; h < real_win_h_end; h++) { 12627+ for (int w = real_win_w_start; w < real_win_w_end; w++) { 12628+ const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; 12629+ tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); 12630+ ++real_count; 12631+ } 12632+ } 12633+ tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); 12634+ 
tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); 12635+ tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); 12636+ SIMD_ST_F32(dst_c_ptr, tmp_avg); 12637+ } 12638+ return ci; 12639+} 12640+ 12641+static inline int MaxPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel, 12642+ float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, 12643+ int in_h_index, int in_w, int in_w_index, float minf, float maxf) { 12644+ SIMD_F32 min_val = SIMD_MOV_F32(minf); 12645+ SIMD_F32 max_val = SIMD_MOV_F32(maxf); 12646+ for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { 12647+ const float *src_c_ptr = src_plane_ptr + ci; 12648+ float *dst_c_ptr = dst_plane_ptr + ci; 12649+ SIMD_F32 tmp_max = min_val; 12650+ for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { 12651+ for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { 12652+ const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; 12653+ tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); 12654+ } 12655+ } 12656+ tmp_max = SIMD_MIN_F32(tmp_max, max_val); 12657+ SIMD_ST_F32(dst_c_ptr, tmp_max); 12658+ } 12659+ return ci; 12660+} 12661+ 12662+#undef MS_SIMD_INSTRUCTION 12663+#undef BLOCK_NUM 12664+#pragma GCC pop_options 12665+#undef MS_SIMD_SSE 12666+#ifdef __cplusplus 12667+} 12668+#endif 12669+#endif 12670diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h 12671new file mode 100644 12672index 00000000..4c46310e 12673--- /dev/null 12674+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h 12675@@ -0,0 +1,101 @@ 12676+/** 12677+ * Copyright 2022 Huawei Technologies Co., Ltd 12678+ * 12679+ * Licensed under the Apache License, Version 2.0 (the "License"); 12680+ * you may not use this file except in compliance with the License. 
12681+ * You may obtain a copy of the License at 12682+ * 12683+ * http://www.apache.org/licenses/LICENSE-2.0 12684+ * 12685+ * Unless required by applicable law or agreed to in writing, software 12686+ * distributed under the License is distributed on an "AS IS" BASIS, 12687+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12688+ * See the License for the specific language governing permissions and 12689+ * limitations under the License. 12690+ */ 12691+#ifndef MINDSPORE_NNACL_FP32_POWER_SSE_H_ 12692+#define MINDSPORE_NNACL_FP32_POWER_SSE_H_ 12693+ 12694+#include "nnacl/intrinsics/ms_simd_instructions.h" 12695+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12696+ 12697+#ifdef __cplusplus 12698+extern "C" { 12699+#endif 12700+#pragma GCC push_options 12701+#pragma GCC target("sse4.1") 12702+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12703+#define BLOCK_NUM 4 12704+#define MS_SIMD_SSE 12705+ 12706+static inline int PowerBroadCastIntExponentSSE(int index, const float *input, int exponent, float *output, int len, 12707+ float scale, float shift) { 12708+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 12709+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 12710+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12711+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 12712+ SIMD_F32 result = SIMD_MOV_F32(1.0f); 12713+ int exp = abs(exponent); 12714+ while (exp) { 12715+ if (exp % 2) { 12716+ result = SIMD_MUL_F32(result, tmp); 12717+ } 12718+ tmp = SIMD_MUL_SQUARE_F32(tmp); 12719+ exp = exp / 2; 12720+ } 12721+ SIMD_ST_F32(output + index, exponent >= 0 ? 
result : SIMD_DIV_F32(SIMD_MOV_F32(1), result)); 12722+ } 12723+ return index; 12724+} 12725+ 12726+static inline int PowerBroadCastFloatExponentSSE(int index, const float *input, float exponent, float *output, int len, 12727+ float scale, float shift) { 12728+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 12729+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 12730+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12731+ SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 12732+ SIMD_F32 result; 12733+ for (int i = 0; i < BLOCK_NUM; ++i) { 12734+ SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent); 12735+ } 12736+ SIMD_ST_F32(output + index, result); 12737+ } 12738+ return index; 12739+} 12740+ 12741+static inline int PowerSingleExponentSSE(int index, const float *input, const float *exponent, float *output, int len, 12742+ float scale, float shift) { 12743+ SIMD_F32 scale_vec = SIMD_MOV_F32(scale); 12744+ SIMD_F32 shift_vec = SIMD_MOV_F32(shift); 12745+ for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12746+ SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); 12747+ for (int j = 0; j < BLOCK_NUM; ++j) { 12748+ float cur_exponent = exponent[index + j]; 12749+ float cur_val = SIMD_F32_GETI(tmp_vec, j); 12750+ if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) { 12751+ int exp = abs((int)(cur_exponent)); 12752+ float result = 1; 12753+ while (exp) { 12754+ if (exp % 2) { 12755+ result *= cur_val; 12756+ } 12757+ cur_val *= cur_val; 12758+ exp = exp / 2; 12759+ } 12760+ output[index + j] = cur_exponent >= 0 ?
result : 1 / result; 12761+ } else { 12762+ output[index + j] = powf(cur_val, cur_exponent); 12763+ } 12764+ } 12765+ } 12766+ return index; 12767+} 12768+ 12769+#undef MS_SIMD_INSTRUCTION 12770+#undef BLOCK_NUM 12771+#pragma GCC pop_options 12772+#undef MS_SIMD_SSE 12773+#ifdef __cplusplus 12774+} 12775+#endif 12776+#endif 12777diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h 12778new file mode 100644 12779index 00000000..936a5d51 12780--- /dev/null 12781+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h 12782@@ -0,0 +1,181 @@ 12783+/** 12784+ * Copyright 2022 Huawei Technologies Co., Ltd 12785+ * 12786+ * Licensed under the Apache License, Version 2.0 (the "License"); 12787+ * you may not use this file except in compliance with the License. 12788+ * You may obtain a copy of the License at 12789+ * 12790+ * http://www.apache.org/licenses/LICENSE-2.0 12791+ * 12792+ * Unless required by applicable law or agreed to in writing, software 12793+ * distributed under the License is distributed on an "AS IS" BASIS, 12794+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12795+ * See the License for the specific language governing permissions and 12796+ * limitations under the License. 
12797+ */ 12798+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_ 12799+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_ 12800+ 12801+#include "nnacl/intrinsics/ms_simd_instructions.h" 12802+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12803+ 12804+#ifdef __cplusplus 12805+extern "C" { 12806+#endif 12807+#pragma GCC push_options 12808+#pragma GCC target("sse4.1") 12809+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12810+#define BLOCK_NUM 4 12811+#define MS_SIMD_SSE 12812+ 12813+static inline int64_t ReduceSumSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12814+ int axis_size) { 12815+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12816+ const float *inner_src = outer_src + index; 12817+ SIMD_F32 tmp = SIMD_MOV_F32(0); 12818+ for (int i = 0; i < axis_size; i++) { 12819+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 12820+ } 12821+ SIMD_ST_F32(outer_dst + index, tmp); 12822+ } 12823+ return index; 12824+} 12825+ 12826+static inline int64_t ReduceMeanSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12827+ int axis_size) { 12828+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12829+ const float *inner_src = outer_src + index; 12830+ SIMD_F32 tmp = SIMD_MOV_F32(0); 12831+ for (int i = 0; i < axis_size; i++) { 12832+ tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 12833+ } 12834+ SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size)); 12835+ } 12836+ return index; 12837+} 12838+ 12839+static inline int64_t ReduceMinSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12840+ int axis_size) { 12841+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12842+ const float *inner_src = outer_src + index; 12843+ SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX); 12844+ for (int i = 0; i < axis_size; 
i++) { 12845+ tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 12846+ } 12847+ SIMD_ST_F32(outer_dst + index, tmp); 12848+ } 12849+ return index; 12850+} 12851+ 12852+static inline int64_t ReduceMaxSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12853+ int axis_size) { 12854+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12855+ const float *inner_src = outer_src + index; 12856+ SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX); 12857+ for (int i = 0; i < axis_size; i++) { 12858+ tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 12859+ } 12860+ SIMD_ST_F32(outer_dst + index, tmp); 12861+ } 12862+ return index; 12863+} 12864+ 12865+static inline int64_t ReduceProdSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12866+ int axis_size) { 12867+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12868+ const float *inner_src = outer_src + index; 12869+ SIMD_F32 tmp = SIMD_MOV_F32(1.0f); 12870+ for (int i = 0; i < axis_size; i++) { 12871+ tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); 12872+ } 12873+ SIMD_ST_F32(outer_dst + index, tmp); 12874+ } 12875+ return index; 12876+} 12877+ 12878+static inline int64_t ReduceSumSquareSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12879+ int axis_size) { 12880+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12881+ const float *inner_src = outer_src + index; 12882+ SIMD_F32 tmp = SIMD_MOV_F32(0); 12883+ for (int i = 0; i < axis_size; i++) { 12884+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 12885+ } 12886+ SIMD_ST_F32(outer_dst + index, tmp); 12887+ } 12888+ return index; 12889+} 12890+ 12891+static inline int64_t ReduceL2NormSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, 12892+ int
axis_size) { 12893+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12894+ const float *inner_src = outer_src + index; 12895+ SIMD_F32 tmp = SIMD_MOV_F32(0); 12896+ for (int i = 0; i < axis_size; i++) { 12897+ tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); 12898+ } 12899+ SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); 12900+ } 12901+ return index; 12902+} 12903+ 12904+static inline int64_t IntReduceSumSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 12905+ int axis_size) { 12906+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12907+ const int *inner_src = outer_src + index; 12908+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 12909+ for (int i = 0; i < axis_size; i++) { 12910+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 12911+ } 12912+ SIMD_ST_EPI32(outer_dst + index, tmp); 12913+ } 12914+ return index; 12915+} 12916+ 12917+static inline int64_t IntReduceMeanSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 12918+ int axis_size) { 12919+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12920+ const int *inner_src = outer_src + index; 12921+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); 12922+ for (int i = 0; i < axis_size; i++) { 12923+ tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 12924+ } 12925+ SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); 12926+ } 12927+ return index; 12928+} 12929+ 12930+static inline int64_t IntReduceMinSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 12931+ int axis_size) { 12932+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12933+ const int *inner_src = outer_src + index; 12934+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); 12935+ for (int i = 0; i < axis_size; i++) { 
12936+ tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 12937+ } 12938+ SIMD_ST_EPI32(outer_dst + index, tmp); 12939+ } 12940+ return index; 12941+} 12942+ 12943+static inline int64_t IntReduceMaxSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, 12944+ int axis_size) { 12945+ for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 12946+ const int *inner_src = outer_src + index; 12947+ SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); 12948+ for (int i = 0; i < axis_size; i++) { 12949+ tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); 12950+ } 12951+ SIMD_ST_EPI32(outer_dst + index, tmp); 12952+ } 12953+ return index; 12954+} 12955+ 12956+#undef MS_SIMD_INSTRUCTION 12957+#undef BLOCK_NUM 12958+#pragma GCC pop_options 12959+#undef MS_SIMD_SSE 12960+#ifdef __cplusplus 12961+} 12962+#endif 12963+#endif 12964diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h 12965new file mode 100644 12966index 00000000..71c89ebc 12967--- /dev/null 12968+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h 12969@@ -0,0 +1,87 @@ 12970+/** 12971+ * Copyright 2022 Huawei Technologies Co., Ltd 12972+ * 12973+ * Licensed under the Apache License, Version 2.0 (the "License"); 12974+ * you may not use this file except in compliance with the License. 12975+ * You may obtain a copy of the License at 12976+ * 12977+ * http://www.apache.org/licenses/LICENSE-2.0 12978+ * 12979+ * Unless required by applicable law or agreed to in writing, software 12980+ * distributed under the License is distributed on an "AS IS" BASIS, 12981+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12982+ * See the License for the specific language governing permissions and 12983+ * limitations under the License. 
12984+ */ 12985+ 12986+#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_ 12987+#define MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_ 12988+ 12989+#include "nnacl/intrinsics/ms_simd_instructions.h" 12990+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 12991+ 12992+#ifdef __cplusplus 12993+extern "C" { 12994+#endif 12995+#pragma GCC push_options 12996+#pragma GCC target("sse4.1") 12997+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 12998+#define BLOCK_NUM 4 12999+#define MS_SIMD_SSE 13000+ 13001+static inline int64_t SoftmaxNormGetMaxSSE(int64_t index, const float *src, int cur_batch_offset, 13002+ float *max, int channel) { 13003+ if (channel >= BLOCK_NUM * BLOCK_NUM) { 13004+ SIMD_F32 max_val = SIMD_MOV_F32(*max); 13005+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13006+ max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); 13007+ } 13008+ *max = SIMD_GET_MAX_F32(max_val); 13009+ } 13010+ return index; 13011+} 13012+ 13013+static inline int64_t SoftmaxNormCalcNormSSE(int64_t index, const float *src, float *dst, 13014+ int cur_batch_offset, float max, int channel) { 13015+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13016+ SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); 13017+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 13018+ } 13019+ return index; 13020+} 13021+ 13022+static inline int64_t SoftmaxLastAxisGetExpSumSSE(int64_t index, const float *src, float *dst, 13023+ int cur_batch_offset, float max, float *exp_sum, int channel) { 13024+#ifndef _WIN32 13025+ SIMD_F32 sum_val = SIMD_SET0_F32; 13026+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13027+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 13028+ SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); 13029+ SIMD_F32 exp_out = SIMD_EXP_F32(output); 13030+ 
sum_val = SIMD_ADD_F32(sum_val, exp_out); 13031+ SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); 13032+ } 13033+ *exp_sum += SIMD_GET_SUM_F32(sum_val); 13034+#endif 13035+ return index; 13036+} 13037+ 13038+static inline int64_t SoftmaxLastAxisGetResultSSE(int64_t index, const float *src, float *dst, 13039+ int cur_batch_offset, float exp_sum, int channel) { 13040+ SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); 13041+ for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13042+ SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); 13043+ SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); 13044+ SIMD_ST_F32(dst + cur_batch_offset + index, output); 13045+ } 13046+ return index; 13047+} 13048+ 13049+#undef MS_SIMD_INSTRUCTION 13050+#undef BLOCK_NUM 13051+#pragma GCC pop_options 13052+#undef MS_SIMD_SSE 13053+#ifdef __cplusplus 13054+}; 13055+#endif 13056+#endif 13057diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h 13058new file mode 100644 13059index 00000000..a6197e19 13060--- /dev/null 13061+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h 13062@@ -0,0 +1,167 @@ 13063+/** 13064+ * Copyright 2022 Huawei Technologies Co., Ltd 13065+ * 13066+ * Licensed under the Apache License, Version 2.0 (the "License"); 13067+ * you may not use this file except in compliance with the License. 13068+ * You may obtain a copy of the License at 13069+ * 13070+ * http://www.apache.org/licenses/LICENSE-2.0 13071+ * 13072+ * Unless required by applicable law or agreed to in writing, software 13073+ * distributed under the License is distributed on an "AS IS" BASIS, 13074+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13075+ * See the License for the specific language governing permissions and 13076+ * limitations under the License. 
13077+ */ 13078+ 13079+#ifndef MINDSPORE_NNACL_FP32_SUB_SSE_H_ 13080+#define MINDSPORE_NNACL_FP32_SUB_SSE_H_ 13081+ 13082+#include "nnacl/intrinsics/ms_simd_instructions.h" 13083+#include "nnacl/intrinsics/ms_simd_sse_instructions.h" 13084+ 13085+#ifdef __cplusplus 13086+extern "C" { 13087+#endif 13088+#pragma GCC push_options 13089+#pragma GCC target("sse4.1") 13090+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION 13091+#define BLOCK_NUM 4 13092+#define MS_SIMD_SSE 13093+ 13094+static inline int ElementOptSubNum0SSE(int index, const float *in0, const float *in1, float *out, 13095+ int size) { 13096+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 13097+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13098+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 13099+ SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); 13100+ SIMD_ST_F32(out + index, vout); 13101+ } 13102+ return index; 13103+} 13104+ 13105+static inline int ElementOptSubNum1SSE(int index, const float *in0, const float *in1, float *out, 13106+ int size) { 13107+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 13108+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13109+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 13110+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); 13111+ SIMD_ST_F32(out + index, vout); 13112+ } 13113+ return index; 13114+} 13115+ 13116+static inline int ElementOptSubIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { 13117+ SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); 13118+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13119+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 13120+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); 13121+ SIMD_ST_EPI32(out + index, vout); 13122+ } 13123+ return index; 13124+} 13125+ 13126+static inline int ElementOptSubIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { 13127+ SIMD_EPI32 
vin1_opt_ = SIMD_MOV_EPI32(in1[0]); 13128+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13129+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 13130+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); 13131+ SIMD_ST_EPI32(out + index, vout); 13132+ } 13133+ return index; 13134+} 13135+ 13136+static inline int ElementOptSubReluNum0SSE(int index, const float *in0, const float *in1, float *out, 13137+ int size) { 13138+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 13139+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13140+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 13141+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); 13142+ SIMD_ST_F32(out + index, vout); 13143+ } 13144+ return index; 13145+} 13146+ 13147+static inline int ElementOptSubReluNum1SSE(int index, const float *in0, const float *in1, float *out, 13148+ int size) { 13149+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 13150+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13151+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 13152+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); 13153+ SIMD_ST_F32(out + index, vout); 13154+ } 13155+ return index; 13156+} 13157+ 13158+static inline int ElementOptSubRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, 13159+ int size) { 13160+ SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); 13161+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13162+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 13163+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); 13164+ SIMD_ST_F32(out + index, vout); 13165+ } 13166+ return index; 13167+} 13168+ 13169+static inline int ElementOptSubRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, 13170+ int size) { 13171+ SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); 13172+ 
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13173+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 13174+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); 13175+ SIMD_ST_F32(out + index, vout); 13176+ } 13177+ return index; 13178+} 13179+ 13180+static inline int ElementSubSSE(int index, const float *in0, const float *in1, float *out, int size) { 13181+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13182+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 13183+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 13184+ SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); 13185+ SIMD_ST_F32(out + index, vout); 13186+ } 13187+ return index; 13188+} 13189+ 13190+static inline int ElementSubIntSSE(int index, const int *in0, const int *in1, int *out, int size) { 13191+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13192+ SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); 13193+ SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); 13194+ SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); 13195+ SIMD_ST_EPI32(out + index, vout); 13196+ } 13197+ return index; 13198+} 13199+ 13200+static inline int ElementSubReluSSE(int index, const float *in0, const float *in1, float *out, 13201+ int size) { 13202+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13203+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 13204+ SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); 13205+ SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); 13206+ SIMD_ST_F32(out + index, vout); 13207+ } 13208+ return index; 13209+} 13210+ 13211+static inline int ElementSubRelu6SSE(int index, const float *in0, const float *in1, float *out, 13212+ int size) { 13213+ for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 13214+ SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); 13215+ SIMD_F32 vin1 = 
SIMD_LD_F32(in1 + index); 13216+ SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); 13217+ SIMD_ST_F32(out + index, vout); 13218+ } 13219+ return index; 13220+} 13221+ 13222+#undef MS_SIMD_INSTRUCTION 13223+#undef BLOCK_NUM 13224+#pragma GCC pop_options 13225+#undef MS_SIMD_SSE 13226+#ifdef __cplusplus 13227+}; 13228+#endif 13229+#endif 13230diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h 13231new file mode 100644 13232index 00000000..894f5d7c 13233--- /dev/null 13234+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h 13235@@ -0,0 +1,36 @@ 13236+/** 13237+ * Copyright 2022 Huawei Technologies Co., Ltd 13238+ * 13239+ * Licensed under the Apache License, Version 2.0 (the "License"); 13240+ * you may not use this file except in compliance with the License. 13241+ * You may obtain a copy of the License at 13242+ * 13243+ * http://www.apache.org/licenses/LICENSE-2.0 13244+ * 13245+ * Unless required by applicable law or agreed to in writing, software 13246+ * distributed under the License is distributed on an "AS IS" BASIS, 13247+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13248+ * See the License for the specific language governing permissions and 13249+ * limitations under the License. 13250+ */ 13251+#ifndef MINDSPORE_NNACL_SUB_FP32_SIMD_H_ 13252+#define MINDSPORE_NNACL_SUB_FP32_SIMD_H_ 13253+ 13254+#include "nnacl/intrinsics/ms_simd_instructions.h" 13255+#ifdef ENABLE_AVX512 13256+#include "nnacl/avx512/sub_fp32_avx512.h" 13257+#endif 13258+ 13259+#ifdef ENABLE_AVX 13260+#include "nnacl/avx/sub_fp32_avx.h" 13261+#endif 13262+ 13263+#ifdef ENABLE_SSE 13264+#include "nnacl/sse/sub_fp32_sse.h" 13265+#endif 13266+ 13267+#ifdef ENABLE_ARM 13268+#include "nnacl/neon/sub_fp32_neon.h" 13269+#endif 13270+ 13271+#endif 13272-- 132732.34.1 13274 13275