From e6e35ad9f7b4c0c99d2f9b62c7d199dd3bf487dc Mon Sep 17 00:00:00 2001
From: Zhu Guodong
Date: Mon, 6 Mar 2023 16:02:57 +0800
Subject: [PATCH 2/4] generate nnacl simd headers manually

---
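Note (not part of the diff; this area between "---" and the diffstat is ignored by git-am):
every generated kernel below consumes BLOCK_NUM elements per loop iteration and returns the
first index it did not process, so callers pair it with a scalar tail loop. A minimal usage
sketch, assuming an AVX build; the wrapper name ElementAddExample is hypothetical, while
ElementAddAVX is defined in avx/add_fp32_avx.h added by this patch:

    #include "nnacl/add_fp32_simd.h"

    static void ElementAddExample(const float *in0, const float *in1, float *out, int size) {
      int index = 0;
    #ifdef ENABLE_AVX
      /* handles the first size / 8 * 8 elements, returns the next unprocessed index */
      index = ElementAddAVX(index, in0, in1, out, size);
    #endif
      for (; index < size; index++) {  /* scalar remainder */
        out[index] = in0[index] + in1[index];
      }
    }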
 .../include/nnacl/activation_fp32_simd.h | 36 +++
 .../include/nnacl/activation_grad_simd.h | 36 +++
 .../nnacl/include/nnacl/adam_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/add_fp32_simd.h | 36 +++
 .../include/nnacl/arithmetic_fp32_simd.h | 36 +++
 .../include/nnacl/arithmetic_self_fp32_simd.h | 36 +++
 .../include/nnacl/avx/activation_fp32_avx.h | 221 +++++++++++++++
 .../include/nnacl/avx/activation_grad_avx.h | 57 ++++
 .../nnacl/include/nnacl/avx/adam_fp32_avx.h | 210 +++++++++++++++
 .../nnacl/include/nnacl/avx/add_fp32_avx.h | 124 +++++++++
 .../include/nnacl/avx/arithmetic_fp32_avx.h | 254 ++++++++++++++++++
 .../nnacl/avx/arithmetic_self_fp32_avx.h | 129 +++++++++
 .../include/nnacl/avx/batchnorm_fp32_avx.h | 67 +++++
 .../nnacl/avx/bce_with_logits_loss_fp32_avx.h | 69 +++++
 .../nnacl/include/nnacl/avx/bias_add_avx.h | 64 +++++
 .../nnacl/include/nnacl/avx/cast_base_avx.h | 56 ++++
 .../nnacl/include/nnacl/avx/cdist_fp32_avx.h | 70 +++++
 .../nnacl/include/nnacl/avx/cumsum_fp32_avx.h | 121 +++++++++
 .../nnacl/include/nnacl/avx/div_fp32_avx.h | 167 ++++++++++++
 .../include/nnacl/avx/dropout_fp32_avx.h | 46 ++++
 .../nnacl/include/nnacl/avx/exp_fp32_avx.h | 63 +++++
 .../nnacl/include/nnacl/avx/fill_base_avx.h | 53 ++++
 .../include/nnacl/avx/group_norm_fp32_avx.h | 77 ++++++
 .../include/nnacl/avx/layer_norm_fp32_avx.h | 68 +++++
 .../nnacl/include/nnacl/avx/matmul_fp32_avx.h | 93 +++++++
 .../nnacl/include/nnacl/avx/mul_fp32_avx.h | 218 +++++++++++++++
 .../include/nnacl/avx/pooling_fp32_avx.h | 84 ++++++
 .../nnacl/include/nnacl/avx/power_fp32_avx.h | 101 +++++++
 .../nnacl/include/nnacl/avx/reduce_fp32_avx.h | 181 +++++++++++++
 .../include/nnacl/avx/softmax_fp32_avx.h | 87 ++++++
 .../nnacl/include/nnacl/avx/sub_fp32_avx.h | 167 ++++++++++++
 .../nnacl/avx512/activation_fp32_avx512.h | 221 +++++++++++++++
 .../nnacl/avx512/activation_grad_avx512.h | 57 ++++
 .../include/nnacl/avx512/adam_fp32_avx512.h | 210 +++++++++++++++
 .../include/nnacl/avx512/add_fp32_avx512.h | 124 +++++++++
 .../nnacl/avx512/arithmetic_fp32_avx512.h | 254 ++++++++++++++++++
 .../avx512/arithmetic_self_fp32_avx512.h | 129 +++++++++
 .../nnacl/avx512/batchnorm_fp32_avx512.h | 67 +++++
 .../avx512/bce_with_logits_loss_fp32_avx512.h | 69 +++++
 .../include/nnacl/avx512/bias_add_avx512.h | 64 +++++
 .../include/nnacl/avx512/cast_base_avx512.h | 56 ++++
 .../include/nnacl/avx512/cdist_fp32_avx512.h | 70 +++++
 .../include/nnacl/avx512/cumsum_fp32_avx512.h | 121 +++++++++
 .../include/nnacl/avx512/div_fp32_avx512.h | 167 ++++++++++++
 .../nnacl/avx512/dropout_fp32_avx512.h | 46 ++++
 .../include/nnacl/avx512/exp_fp32_avx512.h | 63 +++++
 .../include/nnacl/avx512/fill_base_avx512.h | 53 ++++
 .../nnacl/avx512/group_norm_fp32_avx512.h | 77 ++++++
 .../nnacl/avx512/layer_norm_fp32_avx512.h | 68 +++++
 .../include/nnacl/avx512/matmul_fp32_avx512.h | 93 +++++++
 .../include/nnacl/avx512/mul_fp32_avx512.h | 218 +++++++++++++++
 .../nnacl/avx512/pooling_fp32_avx512.h | 84 ++++++
 .../include/nnacl/avx512/power_fp32_avx512.h | 101 +++++++
 .../include/nnacl/avx512/reduce_fp32_avx512.h | 181 +++++++++++++
 .../nnacl/avx512/softmax_fp32_avx512.h | 87 ++++++
 .../include/nnacl/avx512/sub_fp32_avx512.h | 167 ++++++++++++
 .../nnacl/include/nnacl/batchnorm_fp32_simd.h | 36 +++
 .../nnacl/bce_with_logits_loss_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/bias_add_simd.h | 36 +++
 .../nnacl/include/nnacl/cast_base_simd.h | 36 +++
 .../nnacl/include/nnacl/cdist_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/cumsum_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/div_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/dropout_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/exp_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/fill_base_simd.h | 36 +++
 .../include/nnacl/group_norm_fp32_simd.h | 36 +++
 .../include/nnacl/layer_norm_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/matmul_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/mul_fp32_simd.h | 36 +++
 .../include/nnacl/neon/activation_fp32_neon.h | 220 +++++++++++++++
 .../include/nnacl/neon/activation_grad_neon.h | 56 ++++
 .../nnacl/include/nnacl/neon/adam_fp32_neon.h | 209 ++++++++++++++
 .../nnacl/include/nnacl/neon/add_fp32_neon.h | 123 +++++++++
 .../include/nnacl/neon/arithmetic_fp32_neon.h | 253 +++++++++++++++++
 .../nnacl/neon/arithmetic_self_fp32_neon.h | 128 +++++++++
 .../include/nnacl/neon/batchnorm_fp32_neon.h | 66 +++++
 .../neon/bce_with_logits_loss_fp32_neon.h | 68 +++++
 .../nnacl/include/nnacl/neon/bias_add_neon.h | 63 +++++
 .../nnacl/include/nnacl/neon/cast_base_neon.h | 55 ++++
 .../include/nnacl/neon/cdist_fp32_neon.h | 69 +++++
 .../include/nnacl/neon/cumsum_fp32_neon.h | 120 +++++++++
 .../nnacl/include/nnacl/neon/div_fp32_neon.h | 166 ++++++++++++
 .../include/nnacl/neon/dropout_fp32_neon.h | 45 ++++
 .../nnacl/include/nnacl/neon/exp_fp32_neon.h | 62 +++++
 .../nnacl/include/nnacl/neon/fill_base_neon.h | 52 ++++
 .../include/nnacl/neon/group_norm_fp32_neon.h | 76 ++++++
 .../include/nnacl/neon/layer_norm_fp32_neon.h | 67 +++++
 .../include/nnacl/neon/matmul_fp32_neon.h | 92 +++++++
 .../nnacl/include/nnacl/neon/mul_fp32_neon.h | 217 +++++++++++++++
 .../include/nnacl/neon/pooling_fp32_neon.h | 83 ++++++
 .../include/nnacl/neon/power_fp32_neon.h | 100 +++++++
 .../include/nnacl/neon/reduce_fp32_neon.h | 180 +++++++++++++
 .../include/nnacl/neon/softmax_fp32_neon.h | 86 ++++++
 .../nnacl/include/nnacl/neon/sub_fp32_neon.h | 166 ++++++++++++
 .../nnacl/include/nnacl/pooling_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/power_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/reduce_fp32_simd.h | 36 +++
 .../nnacl/include/nnacl/softmax_fp32_simd.h | 36 +++
 .../include/nnacl/sse/activation_fp32_sse.h | 221 +++++++++++++++
 .../include/nnacl/sse/activation_grad_sse.h | 57 ++++
 .../nnacl/include/nnacl/sse/adam_fp32_sse.h | 210 +++++++++++++++
 .../nnacl/include/nnacl/sse/add_fp32_sse.h | 124 +++++++++
 .../include/nnacl/sse/arithmetic_fp32_sse.h | 254 ++++++++++++++++++
 .../nnacl/sse/arithmetic_self_fp32_sse.h | 129 +++++++++
 .../include/nnacl/sse/batchnorm_fp32_sse.h | 67 +++++
 .../nnacl/sse/bce_with_logits_loss_fp32_sse.h | 69 +++++
 .../nnacl/include/nnacl/sse/bias_add_sse.h | 64 +++++
 .../nnacl/include/nnacl/sse/cast_base_sse.h | 56 ++++
 .../nnacl/include/nnacl/sse/cdist_fp32_sse.h | 70 +++++
 .../nnacl/include/nnacl/sse/cumsum_fp32_sse.h | 121 +++++++++
 .../nnacl/include/nnacl/sse/div_fp32_sse.h | 167 ++++++++++++
 .../include/nnacl/sse/dropout_fp32_sse.h | 46 ++++
 .../nnacl/include/nnacl/sse/exp_fp32_sse.h | 63 +++++
 .../nnacl/include/nnacl/sse/fill_base_sse.h | 53 ++++
 .../include/nnacl/sse/group_norm_fp32_sse.h | 77 ++++++
 .../include/nnacl/sse/layer_norm_fp32_sse.h | 68 +++++
 .../nnacl/include/nnacl/sse/matmul_fp32_sse.h | 93 +++++++
 .../nnacl/include/nnacl/sse/mul_fp32_sse.h | 218 +++++++++++++++
 .../include/nnacl/sse/pooling_fp32_sse.h | 84 ++++++
 .../nnacl/include/nnacl/sse/power_fp32_sse.h | 101 +++++++
 .../nnacl/include/nnacl/sse/reduce_fp32_sse.h | 181 +++++++++++++
 .../include/nnacl/sse/softmax_fp32_sse.h | 87 ++++++
 .../nnacl/include/nnacl/sse/sub_fp32_sse.h | 167 ++++++++++++
 .../nnacl/include/nnacl/sub_fp32_simd.h | 36 +++
 125 files changed, 12263 insertions(+)
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h

diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
new file mode 100644
index 00000000..fead4fd3
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_fp32_simd.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_
+#define MINDSPORE_NNACL_ACTIVATION_FP32_SIMD_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#ifdef ENABLE_AVX512
+#include "nnacl/avx512/activation_fp32_avx512.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "nnacl/avx/activation_fp32_avx.h"
+#endif
+
+#ifdef ENABLE_SSE
+#include "nnacl/sse/activation_fp32_sse.h"
+#endif
+
+#ifdef ENABLE_ARM
+#include "nnacl/neon/activation_fp32_neon.h"
+#endif
+
+#endif
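Note (not part of the diff): each umbrella *_simd.h header like the one above pulls in every
per-ISA variant the build enables, and each per-ISA header compiles under its own #pragma GCC
target, so all vector widths can coexist in one translation unit. Kernels can then be chained
from widest to narrowest, because every variant resumes at the index the previous one
returned. A minimal sketch, assuming an x86 build and the same suffix convention for the
AVX512/SSE variants (Fp32ReluAVX512, Fp32ReluSSE); the dispatcher name Fp32ReluDispatch is
hypothetical:

    #include "nnacl/activation_fp32_simd.h"

    static int Fp32ReluDispatch(int index, const float *src, int length, float *dst) {
    #ifdef ENABLE_AVX512
      index = Fp32ReluAVX512(index, src, length, dst); /* 16 floats per step */
    #endif
    #ifdef ENABLE_AVX
      index = Fp32ReluAVX(index, src, length, dst);    /* 8 floats per step */
    #endif
    #ifdef ENABLE_SSE
      index = Fp32ReluSSE(index, src, length, dst);    /* 4 floats per step */
    #endif
      for (; index < length; index++) {                /* scalar tail */
        dst[index] = src[index] > 0.0f ? src[index] : 0.0f;
      }
      return index;
    }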
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
new file mode 100644
index 00000000..c8637379
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/activation_grad_simd.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_
+#define MINDSPORE_NNACL_ACTIVATION_GRAD_SIMD_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#ifdef ENABLE_AVX512
+#include "nnacl/avx512/activation_grad_avx512.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "nnacl/avx/activation_grad_avx.h"
+#endif
+
+#ifdef ENABLE_SSE
+#include "nnacl/sse/activation_grad_sse.h"
+#endif
+
+#ifdef ENABLE_ARM
+#include "nnacl/neon/activation_grad_neon.h"
+#endif
+
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
new file mode 100644
index 00000000..267799ed
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/adam_fp32_simd.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ADAM_FP32_SIMD_H_
+#define MINDSPORE_NNACL_ADAM_FP32_SIMD_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#ifdef ENABLE_AVX512
+#include "nnacl/avx512/adam_fp32_avx512.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "nnacl/avx/adam_fp32_avx.h"
+#endif
+
+#ifdef ENABLE_SSE
+#include "nnacl/sse/adam_fp32_sse.h"
+#endif
+
+#ifdef ENABLE_ARM
+#include "nnacl/neon/adam_fp32_neon.h"
+#endif
+
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
new file mode 100644
index 00000000..83cd76ec
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/add_fp32_simd.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ADD_FP32_SIMD_H_
+#define MINDSPORE_NNACL_ADD_FP32_SIMD_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#ifdef ENABLE_AVX512
+#include "nnacl/avx512/add_fp32_avx512.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "nnacl/avx/add_fp32_avx.h"
+#endif
+
+#ifdef ENABLE_SSE
+#include "nnacl/sse/add_fp32_sse.h"
+#endif
+
+#ifdef ENABLE_ARM
+#include "nnacl/neon/add_fp32_neon.h"
+#endif
+
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
new file mode 100644
index 00000000..898fe882
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_fp32_simd.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_
+#define MINDSPORE_NNACL_ARITHMETIC_FP32_SIMD_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#ifdef ENABLE_AVX512
+#include "nnacl/avx512/arithmetic_fp32_avx512.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "nnacl/avx/arithmetic_fp32_avx.h"
+#endif
+
+#ifdef ENABLE_SSE
+#include "nnacl/sse/arithmetic_fp32_sse.h"
+#endif
+
+#ifdef ENABLE_ARM
+#include "nnacl/neon/arithmetic_fp32_neon.h"
+#endif
+
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
new file mode 100644
index 00000000..676b53ec
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/arithmetic_self_fp32_simd.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_
+#define MINDSPORE_NNACL_ARITHMETIC_SELF_FP32_SIMD_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#ifdef ENABLE_AVX512
+#include "nnacl/avx512/arithmetic_self_fp32_avx512.h"
+#endif
+
+#ifdef ENABLE_AVX
+#include "nnacl/avx/arithmetic_self_fp32_avx.h"
+#endif
+
+#ifdef ENABLE_SSE
+#include "nnacl/sse/arithmetic_self_fp32_sse.h"
+#endif
+
+#ifdef ENABLE_ARM
+#include "nnacl/neon/arithmetic_self_fp32_neon.h"
+#endif
+
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
new file mode 100644
index 00000000..49edf7ec
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_fp32_avx.h
@@ -0,0 +1,221 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_
+#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int Fp32ReluAVX(int index, const float *src, int length, float *dst) {
+  SIMD_F32 zero = SIMD_SET0_F32;
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero));
+  }
+  return index;
+}
+
+static inline int Int32ReluAVX(int index, const int32_t *src, int length, int32_t *dst) {
+  SIMD_EPI32 zero = SIMD_MOV_EPI32(0);
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero));
+  }
+  return index;
+}
+
+static inline int Fp32Relu6AVX(int index, const float *src, int length, float *dst) {
+  SIMD_F32 zero = SIMD_SET0_F32;
+  SIMD_F32 six = SIMD_MOV_F32(6.0f);
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six));
+  }
+  return index;
+}
+
+static inline int LReluAVX(int index, const float *src, int length, float *dst, float alpha) {
+  SIMD_F32 alpha_data = SIMD_MOV_F32(alpha);
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp);
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask));
+  }
+  return index;
+}
+
+static inline int SigmoidAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index);
+    SIMD_ST_F32(dst + index,
+                SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
+  }
+  return index;
+}
+
+static inline int TanhAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input = SIMD_LD_F32(src + index);
+    SIMD_ST_F32(dst + index, SIMD_TANH_F32(input));
+  }
+  return index;
+}
+
+static inline int SwishAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_value = SIMD_LD_F32(src + index);
+    SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index);
+    SIMD_ST_F32(dst + index,
+                SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index))));
+  }
+  return index;
+}
+
+static inline int HSwishAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_value = SIMD_LD_F32(src + index);
+    SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
+    SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6));
+  }
+  return index;
+}
+
+static inline int HSigmoidAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_value = SIMD_LD_F32(src + index);
+    SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6);
+    SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6));
+  }
+  return index;
+}
+
+static inline int HardTanhNoLimitMinAVX(int index, const float *src, int length, float *dst, float min_val,
+                                        float max_val) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val));
+  }
+  return index;
+}
+
+static inline int HardTanhNoLimitMaxAVX(int index, const float *src, int length, float *dst, float min_val,
+                                        float max_val) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val));
+  }
+  return index;
+}
+
+static inline int HardTanhLimitMinMaxAVX(int index, const float *src, int length, float *dst, float min_val,
+                                         float max_val) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val));
+  }
+  return index;
+}
+
+static inline int GeluApproximateAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in = SIMD_LD_F32(src + index);
+    SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in);
+    SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in);
+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f)));
+  }
+  return index;
+}
+
+static inline int GeluAVX(int index, const float *src, int length, float *dst) {
+  SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f);
+  SIMD_F32 para2 = SIMD_MOV_F32(1.0f);
+  SIMD_F32 para3 = SIMD_MOV_F32(0.5f);
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in = SIMD_LD_F32(src + index);
+    SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1))));
+    SIMD_ST_F32(dst + index, res);
+  }
+  return index;
+}
+
+static inline int EluAVX(int index, const float *src, int length, float *dst, float alpha) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f);
+    SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
+  }
+  return index;
+}
+
+static inline int CeluAVX(int index, const float *src, int length, float *dst, float alpha) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f);
+    SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32);
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask));
+  }
+  return index;
+}
+
+static inline int HShrinkAVX(int index, const float *src, int length, float *dst, float lambd) {
+  const float neg_lambd = -1 * lambd;
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
+    SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
+    SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
+  }
+  return index;
+}
+
+static inline int SoftShrinkAVX(int index, const float *src, int length, float *dst, float lambd) {
+  SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
+  SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
+
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_t = SIMD_LD_F32(src + index);
+    /* v0 = (in > lambd) & (in - lambd) */
+    SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v));
+    /* v1 = (in < -lambd) & (in + lambd) */
+    SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v));
+    /* out = (v0 | v1) */
+    SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
+  }
+  return index;
+}
+
+static inline int SoftsignFp32OptAVX(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
+    SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
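Note (not part of the diff): GeluApproximateAVX above implements the tanh approximation of
GELU; its two magic constants are \sqrt{2/\pi} \approx 0.79788456080287 and
0.044715 \cdot \sqrt{2/\pi} \approx 0.035677408136, i.e.

    \mathrm{GELU}(x) \approx \tfrac{1}{2}\,x\,\bigl(1 + \tanh\bigl(\sqrt{2/\pi}\,(x + 0.044715\,x^{3})\bigr)\bigr)
                           = \tfrac{1}{2}\,x\,\bigl(1 + \tanh(0.79788456\,x + 0.035677408\,x^{3})\bigr)

while GeluAVX evaluates the exact form \tfrac{1}{2}\,x\,(1 + \operatorname{erf}(x/\sqrt{2})).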
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
new file mode 100644
index 00000000..435d24c5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/activation_grad_avx.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_
+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int ShrinkGradAVX(int index, const float *src0, const float *src1,
+                                int length, float *dst, float lambd) {
+  SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd);
+  SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd);
+
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
+    SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
+
+    SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v);
+    SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t);
+    SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
+
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
new file mode 100644
index 00000000..54743d80
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/adam_fp32_avx.h
@@ -0,0 +1,210 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_
+#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+#ifdef MS_SIMD_AVX512
+static inline size_t AdamWeightDecayFp32AVX(size_t index, float *var, float *m, float *v, float lr, float beta1,
+                                            float beta2, float epsilon, float decay,
+                                            const float *gradient, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_LD_F32(gradient + index);
+
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+    SIMD_ST_F32(var + index, var_r);
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp32Fp16AVX(size_t index, float *var, const int16_t *gradient16, float *m, float *v,
+                                              float lr, float beta1, float beta2, float epsilon, float decay,
+                                              float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
+
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(var + index, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp32Fp32AVX(size_t index, float *var, const float *gradient32, float *m, float *v,
+                                              float lr, float beta1, float beta2, float epsilon, float decay,
+                                              float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
+
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(var + index, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp16Fp16AVX(size_t index, int16_t *var16, const int16_t *gradient16, float *m,
+                                              float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                                              float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp16Fp32AVX(size_t index, int16_t *var16, const float *gradient32, float *m,
+                                              float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                                              float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
+  }
+
+  return index;
+}
+#endif
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
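Note (not part of the diff): the Adam bodies above are wrapped in #ifdef MS_SIMD_AVX512, so
they compile away in this AVX header and only take effect in the variant that defines that
macro; the Fused* variants additionally scale the gradient by global_norm_reciprocal before
the update. A scalar reference sketch of the fused step they vectorize, for one element (the
function name AdamWeightDecayScalar is hypothetical):

    #include <math.h>

    static void AdamWeightDecayScalar(float *var, float *m, float *v, float lr, float beta1,
                                      float beta2, float epsilon, float decay, float g) {
      *m = beta1 * (*m) + (1.0f - beta1) * g;      /* first moment  */
      *v = beta2 * (*v) + (1.0f - beta2) * g * g;  /* second moment */
      float update = *m / (sqrtf(*v) + epsilon);   /* adaptive step */
      update += decay * (*var);                    /* decoupled weight decay */
      *var -= lr * update;                         /* var_r = SIMD_FMADD_F32(update, -lr, var_r) */
    }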
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
new file mode 100644
index 00000000..716c25b1
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/add_fp32_avx.h
@@ -0,0 +1,124 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_ADD_AVX_H_
+#define MINDSPORE_NNACL_FP32_ADD_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int ElementOptAddAVX(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptAddIntAVX(int index, const int *in0, const int *in1, int *out,
+                                      int size) {
+  SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptAddReluAVX(int index, const float *in0, const float *in1, float *out,
+                                       int size) {
+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptAddRelu6AVX(int index, const float *in0, const float *in1, float *out,
+                                        int size) {
+  SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementAddAVX(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementAddReluAVX(int index, const float *in0, const float *in1, float *out,
+                                    int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementAddRelu6AVX(int index, const float *in0, const float *in1, float *out,
+                                     int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementAddIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
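Note (not part of the diff): the ElementOpt* kernels above implement scalar-tensor broadcast
by splatting in0[0] across the vector, while the plain Element* kernels are elementwise; a
caller is expected to pick the variant from the operand shapes. A minimal sketch, assuming an
AVX build (the wrapper name AddBroadcastExample is hypothetical):

    static void AddBroadcastExample(const float *in0, int in0_size, const float *in1,
                                    float *out, int size) {
      int index = 0;
    #ifdef ENABLE_AVX
      index = (in0_size == 1) ? ElementOptAddAVX(index, in0, in1, out, size)   /* in0 is a scalar */
                              : ElementAddAVX(index, in0, in1, out, size);     /* same-shape add  */
    #endif
      for (; index < size; index++) {  /* scalar remainder */
        out[index] = (in0_size == 1 ? in0[0] : in0[index]) + in1[index];
      }
    }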
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
new file mode 100644
index 00000000..9dd24100
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_fp32_avx.h
@@ -0,0 +1,254 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX_H_
+#define MINDSPORE_NNACL_ARITHMETIC_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+#ifndef MS_SIMD_NEON
+static inline int ElementFloorModAVX(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+static inline int ElementOptFloorModNum0AVX(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+static inline int ElementOptFloorModNum1AVX(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp));
+    SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp));
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+static inline int ElementFloorDivAVX(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index);
floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, floor_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum0AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} +#endif + +static inline int ElementFloorDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMaximumAVX(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int 
ElementMaximumIntAVX(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumIntAVX(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumAVX(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum0AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = 
size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
new file mode 100644
index 00000000..c48500f4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/arithmetic_self_fp32_avx.h
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_
+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+#if defined(MS_SIMD_AVX512)
+// only AVX512 supports the abs fp32 instruction
+static inline int ElementAbsAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementAbsIntAVX(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementSquareAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin = SIMD_LD_F32(input + index);
+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
+  }
+  return index;
+}
+
+static inline int ElementSqrtAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementRsqrtAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
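+// Note: the MS_SIMD_AVX512 / MS_SIMD_SSE / MS_SIMD_NEON guards in this file
+// appear to come from the shared template these per-ISA headers are generated
+// from; in this AVX header only MS_SIMD_AVX is defined above, so the
+// MS_SIMD_AVX512-only abs block compiles out and the #ifndef MS_SIMD_NEON
+// blocks are always kept.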
+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
+// AVX512 doesn't support the round fp32 instruction
+static inline int ElementRoundAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+// NEON doesn't support the floor fp32 instruction
+static inline int ElementFloorAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+static inline int ElementCeilAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementNegativeAVX(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
+  }
+  return index;
+}
+
+static inline int ElementNegativeIntAVX(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
+  }
+  return index;
+}
+
+static inline int ElementReciprocalAVX(int index, const float *input, float *output, const int element_size) {
+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
new file mode 100644
index 00000000..11a9087b
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/batchnorm_fp32_avx.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_
+#define MINDSPORE_NNACL_FP32_BATCHNORM_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
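+// Scalar reference for the two kernels below (per element i, derived from the
+// vector code):
+//   BatchNorm:      out[i] = (in[i] - mean[i]) / sqrt(variance[i] + epsilon)
+//   FusedBatchNorm: out[i] = scale[i] * normalized[i] + offset[i]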
+static inline int BatchNormFp32AVX(int index, const float *input, const float *mean,
+                                   const float *variance, int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+static inline int FusedBatchNormFp32AVX(int index, const float *input, const float *scale,
+                                        const float *offset, const float *mean, const float *variance,
+                                        int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
new file mode 100644
index 00000000..9da68a79
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bce_with_logits_loss_fp32_avx.h
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_ +#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int BCEWithLogitLossAVX(int index, const float *logits, const float *label, + const float *weight, const float *pos_weight, int length, bool reduction, float *output, + float *reduction_sum) { + SIMD_F32 zero = SIMD_SET0_F32; + SIMD_F32 ones = SIMD_MOV_F32(1.0f); + SIMD_F32 middle_output = SIMD_SET0_F32; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); + SIMD_F32 label_tmp = SIMD_LD_F32(label + index); + SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); + SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); + SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); + SIMD_F32 max_value = neg_logits_tmp; + max_value = SIMD_MIN_F32(max_value, zero); + SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); + SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); + SIMD_F32 log_exp_value = + SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); + SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), + SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); + if (reduction) { + middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); + } else { + SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp)); + } + } + if (reduction) { + *reduction_sum += SIMD_GET_SUM_F32(middle_output); + } + return index; +} +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h new file mode 100644 index 00000000..e54588bb --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/bias_add_avx.h @@ -0,0 +1,64 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int BiasAddByInnerCoreAVX(int index, const float *input, const float *bias, float *output,
+                                        int64_t num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
+    SIMD_ST_F32(output + index, vout);
+  }
+  return index;
+}
+
+static inline int BiasAddByBatchCoreAVX(int index, const float *input, const float *bias, float *output1,
+                                        float *output2, float *output3, float *output4, int64_t num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_LDX4_F32(input_data, input + index, num);
+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_AVX_H_
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
new file mode 100644
index 00000000..44176549
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cast_base_avx.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_ +#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int Int32ToFloat32AVX(int index, const int32_t *input, float *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 value = SIMD_LD_EPI32(input + index); + SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); + } + return index; +} + +#ifndef MS_SIMD_NEON +static inline int Float32ToInt32AVX(int index, const float *input, int32_t *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 value = SIMD_LD_F32(input + index); + SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); + } + return index; +} +#endif + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h new file mode 100644 index 00000000..dac9efa9 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cdist_fp32_avx.h @@ -0,0 +1,70 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX_H_ +#define MINDSPORE_NNACL_FP32_CDIST_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int64_t CdistTwoNormalOptAVX(int64_t index, const float *a, const float *b, + float *out, int64_t size) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +static inline int64_t CdistPNormalOptAVX(int64_t index, const float *a, const float *b, + float *out, int64_t size, float p) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + SIMD_F32 p_vec = SIMD_MOV_F32(p); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); + result_vec = SIMD_ADD_F32(tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h new file mode 100644 index 00000000..7407942f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/cumsum_fp32_avx.h @@ -0,0 +1,121 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_
+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumOutputInitWithInputAVX(int64_t index, const float *layer_input,
+                                                   float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumOutputInitWithZeroAVX(int64_t index, float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
+  }
+  return index;
+}
+
+static inline int64_t CumsumAVX(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
+                                int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
+// (a, b, c) -> (c+b, c, 0) exclusive==true
+static inline int64_t CumsumReverseAVX(int64_t index, const float *layer_input, float *layer_output,
+                                       float *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumIntOutputInitWithInputAVX(int64_t index, const int *layer_input,
+                                                      int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntOutputInitWithZeroAVX(int64_t index, int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
+                                   int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
+    SIMD_ST_EPI32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c)
exclusive==false +// (a, b, c) -> (c+b, c, 0) exclusive==true +static inline int64_t CumsumReverseIntAVX(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, + int inner_dim) { + for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); + SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); + SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); + SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h new file mode 100644 index 00000000..3710151e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/div_fp32_avx.h @@ -0,0 +1,167 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_DIV_AVX_H_ +#define MINDSPORE_NNACL_FP32_DIV_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int ElementOptDivNum0AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivNum1AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index 
< block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum0AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum1AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivAVX(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivIntAVX(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementDivReluAVX(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivRelu6AVX(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 
0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
new file mode 100644
index 00000000..cbd4eca5
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/dropout_fp32_avx.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_
+#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
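+// The kernel below only applies the scale factor; mask generation and the
+// zeroing of dropped elements are presumably handled by the caller. For
+// inverted dropout the scale is typically 1.0f / (1.0f - ratio), but that is
+// an assumption about the caller, not something this kernel enforces.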
+static inline int DropoutFp32AVX(int index, const float *input, float scale,
+                                 int length, float *output) {
+  SIMD_F32 scale_value = SIMD_MOV_F32(scale);
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value));
+  }
+  return index;
+}
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
new file mode 100644
index 00000000..cf7cbd37
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/exp_fp32_avx.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX_H_
+#define MINDSPORE_NNACL_FP32_EXP_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int64_t ExpFp32AVX(int64_t index, const float *src, float *dst, int num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithInScaleAVX(int64_t index, const float *src, float *dst, int num, float in_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithOutScaleAVX(int64_t index, const float *src, float *dst, int num, float out_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
new file mode 100644
index 00000000..8b01844e
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/fill_base_avx.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_ +#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int FillFp32AVX(int index, float *output, int size, float data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); + } + return index; +} + +static inline int FillInt32AVX(int index, int *output, int size, int data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif + diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h new file mode 100644 index 00000000..d5076e59 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/group_norm_fp32_avx.h @@ -0,0 +1,77 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_ +#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int64_t GroupNormFp32AVX(int64_t index, const float *unit_input, float scale, float offset, float mean, + float var_sqrt, int unit, float *unit_output) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); + SIMD_F32 scale_val = SIMD_MOV_F32(scale); + SIMD_F32 offset_val = SIMD_MOV_F32(offset); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(unit_input + index); + SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); + SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); + SIMD_ST_F32(unit_output + index, output); + } + return index; +} + +static inline int64_t GroupNormReduceSumAVX(int64_t index, const float *in, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +static inline int64_t GroupNormReduceVarAVX(int64_t index, const float *in, float mean, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h new file mode 100644 index 00000000..96fdf185 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/layer_norm_fp32_avx.h @@ -0,0 +1,68 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_
+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int LayerNormMeanAndSquareAVX(int index, const float *src, int num, float *mean, float *square_mean) {
+  if (num >= 4 * BLOCK_NUM) {
+    SIMD_F32 sum_val = SIMD_SET0_F32;
+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+      SIMD_F32 value = SIMD_LD_F32(src + index);
+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
+      sum_val = SIMD_ADD_F32(sum_val, value);
+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
+    }
+    *mean += SIMD_GET_SUM_F32(sum_val);
+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
+  }
+  return index;
+}
+
+static inline int LayerNormGammaAndBetaAVX(int index, float *dst, const float *src, const float *gamma_data,
+                                           const float *beta_data, int num, const float mean, const float deno) {
+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 value = SIMD_LD_F32(src + index);
+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
+    out_value = SIMD_MUL_F32(out_value, deno_val);
+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
+    SIMD_ST_F32(dst + index, out_value);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
new file mode 100644
index 00000000..523e120e
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/matmul_fp32_avx.h
@@ -0,0 +1,93 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_
+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+// act_type must be 0, 1, or 3. 0: no_act, 1: relu, 3: relu6.
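+// A scalar sketch of the kernel below, for reference only (plain C, not part
+// of the nnacl API):
+//   for (int i = 0; i < row; ++i) {
+//     float v = b[0] * a[i] + bias[0];
+//     if (act_type != 0) v = fmaxf(v, 0.0f);  // relu
+//     if (act_type == 3) v = fminf(v, 6.0f);  // relu6
+//     c[i] = v;
+//   }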
+static inline int64_t GemmIsNotPackAVX(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, + int deep, int act_type) { + SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); + SIMD_F32 up_threshold = SIMD_MOV_F32(6); + SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); + SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); + for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_data = SIMD_LD_F32(a + index); + SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); + if (act_type != 0) { + dst = SIMD_MAX_F32(dst, down_threshold); + if (act_type == 3) { + dst = SIMD_MIN_F32(dst, up_threshold); + } + } + SIMD_ST_F32(c + index, dst); + } + + return index; +} + +#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) +static inline int64_t GemmIsNotPackOptimizeCoreAVX(int64_t index, const float *a, const float *b, int k, float *dst) { + SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); + for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 weight = SIMD_LD_F32(b + index); + SIMD_F32 a1 = SIMD_LD_F32(a + index); + dst1 = SIMD_FMADD_F32(weight, a1, dst1); + } + *dst += SIMD_REDUCE_ADD_F32(dst1); + return index; +} +#endif + +static inline int64_t MatVecMulNoPackCoreAVX(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, + int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { + for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { + SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); + for (int64_t k = 0; k < depth; ++k) { + SIMD_F32 left = SIMD_MOV_F32(a[k]); + SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); + out = SIMD_FMADD_F32(left, right, out); + } + if ((inc_flag & 0x2) != 0 && act_type != 0) { + out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); + if (act_type == 0x3) { + out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); + } + } + SIMD_ST_F32(c + oc_index, out); + } + return oc_index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h new file mode 100644 index 00000000..a5d8b0a0 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/mul_fp32_avx.h @@ -0,0 +1,218 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX_H_
+#define MINDSPORE_NNACL_FP32_MUL_AVX_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx", "avx2")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION
+#define BLOCK_NUM 8
+#define MS_SIMD_AVX
+
+static inline int ElementMulAVX(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluAVX(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6AVX(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluIntAVX(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6IntAVX(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0), 6);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
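+// Naming convention for the Opt variants below (inferred from the loads): Num0
+// means in0 is a single scalar broadcast across the vector, Num1 means in1 is
+// the broadcast scalar; the other operand is read element-wise.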
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulReluNum0AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulReluNum1AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulReluIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulReluIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); +
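/* integer ReLU: multiply, then clamp every 32-bit lane at zero */ +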
SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulRelu6IntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0), 6); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+static inline int ElementOptMulRelu6IntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0), 6); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h new file mode 100644 index 00000000..d4bd2305 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/pooling_fp32_avx.h @@ -0,0 +1,84 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX_H_ +#define MINDSPORE_NNACL_FP32_POOLING_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX +
+static inline int AvgPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_avg = SIMD_SET0_F32; + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); + ++real_count; + } + } + tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); + tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); + tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_avg); + } + return ci; +} +
+static inline int MaxPoolingBatchAVX(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_max = min_val; +
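/* take the window max lane-wise; min_val seeds the running max and doubles as the lower activation bound */ +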
for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); + } + } + tmp_max = SIMD_MIN_F32(tmp_max, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_max); + } + return ci; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h new file mode 100644 index 00000000..2ada6cb3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/power_fp32_avx.h @@ -0,0 +1,101 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_POWER_AVX_H_ +#define MINDSPORE_NNACL_FP32_POWER_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX +
+static inline int PowerBroadCastIntExponentAVX(int index, const float *input, int exponent, float *output, int len, + float scale, float shift) { + SIMD_F32 scale_vec = SIMD_MOV_F32(scale); + SIMD_F32 shift_vec = SIMD_MOV_F32(shift); + for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); + SIMD_F32 result = SIMD_MOV_F32(1.0f); +
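/* exponentiation by squaring: fold tmp into result whenever the low bit of exp is set, then square tmp; e.g. exp = 5 = 101b folds in x and x^4, giving x^5 */ +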
int exp = abs(exponent); + while (exp) { + if (exp % 2) { + result = SIMD_MUL_F32(result, tmp); + } + tmp = SIMD_MUL_SQUARE_F32(tmp); + exp = exp / 2; + } + SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result)); + } + return index; +} +
+static inline int PowerBroadCastFloatExponentAVX(int index, const float *input, float exponent, float *output, int len, + float scale, float shift) { + SIMD_F32 scale_vec = SIMD_MOV_F32(scale); + SIMD_F32 shift_vec = SIMD_MOV_F32(shift); + for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); + SIMD_F32 result; + for (int i = 0; i < BLOCK_NUM; ++i) { + SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent); + } + SIMD_ST_F32(output + index, result); + } + return index; +} +
+static inline int PowerSingleExponentAVX(int index, const float *input, const float *exponent, float *output, int len, + float scale, float shift) { + SIMD_F32 scale_vec = SIMD_MOV_F32(scale); + SIMD_F32 shift_vec = SIMD_MOV_F32(shift); + for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); + for (int j = 0; j < BLOCK_NUM; ++j) { + float cur_exponent = exponent[index + j]; + float cur_val = SIMD_F32_GETI(tmp_vec, j); + if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) { + int exp = abs((int)(cur_exponent)); + float result = 1; + while (exp) { + if (exp % 2) { + result *= cur_val; + } + cur_val *= cur_val; + exp = exp / 2; + } + output[index + j] = cur_exponent >= 0 ? result : 1 / result; + } else { + output[index + j] = powf(cur_val, cur_exponent); + } + } + } + return index; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h new file mode 100644 index 00000000..03339e42 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/reduce_fp32_avx.h @@ -0,0 +1,181 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_ +#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX +
+static inline int64_t ReduceSumAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceMeanAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size)); + } + return index; +} +
+static inline int64_t ReduceMinAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceMaxAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceProdAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(1.0f); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceSumSquareAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { +
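/* accumulate x * x along the reduced axis for this block of lanes */ +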
const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceL2NormAVX(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); + } + return index; +} +
+static inline int64_t IntReduceSumAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t IntReduceMeanAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); + } + return index; +} +
+static inline int64_t IntReduceMinAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t IntReduceMaxAVX(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h new file mode 100644 index 00000000..8229111d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/softmax_fp32_avx.h @@ -0,0 +1,87 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_ +#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int64_t SoftmaxNormGetMaxAVX(int64_t index, const float *src, int cur_batch_offset, + float *max, int channel) { + if (channel >= BLOCK_NUM * BLOCK_NUM) { + SIMD_F32 max_val = SIMD_MOV_F32(*max); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); + } + *max = SIMD_GET_MAX_F32(max_val); + } + return index; +} + +static inline int64_t SoftmaxNormCalcNormAVX(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, int channel) { + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} + +static inline int64_t SoftmaxLastAxisGetExpSumAVX(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, float *exp_sum, int channel) { +#ifndef _WIN32 + SIMD_F32 sum_val = SIMD_SET0_F32; + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); + SIMD_F32 exp_out = SIMD_EXP_F32(output); + sum_val = SIMD_ADD_F32(sum_val, exp_out); + SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); + } + *exp_sum += SIMD_GET_SUM_F32(sum_val); +#endif + return index; +} + +static inline int64_t SoftmaxLastAxisGetResultAVX(int64_t index, const float *src, float *dst, + int cur_batch_offset, float exp_sum, int channel) { + SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h new file mode 100644 index 00000000..a3ed93d4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx/sub_fp32_avx.h @@ -0,0 +1,167 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SUB_AVX_H_ +#define MINDSPORE_NNACL_FP32_SUB_AVX_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx", "avx2") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX_INSTRUCTION +#define BLOCK_NUM 8 +#define MS_SIMD_AVX + +static inline int ElementOptSubNum0AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubNum1AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum0AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum1AVX(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum0AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum1AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num0AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num1AVX(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubAVX(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubIntAVX(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementSubReluAVX(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubRelu6AVX(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h new file mode 100644 index 00000000..f6457628 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_fp32_avx512.h @@ -0,0 +1,221 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_ +#define MINDSPORE_NNACL_FP32_ACTIVATION_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 +
+static inline int Fp32ReluAVX512(int index, const float *src, int length, float *dst) { + SIMD_F32 zero = SIMD_SET0_F32; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); + } + return index; +} +
+static inline int Int32ReluAVX512(int index, const int32_t *src, int length, int32_t *dst) { + SIMD_EPI32 zero = SIMD_MOV_EPI32(0); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); + } + return index; +} +
+static inline int Fp32Relu6AVX512(int index, const float *src, int length, float *dst) { + SIMD_F32 zero = SIMD_SET0_F32; + SIMD_F32 six = SIMD_MOV_F32(6.0f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); + } + return index; +} +
+static inline int LReluAVX512(int index, const float *src, int length, float *dst, float alpha) { + SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask)); + } + return index; +} +
+static inline int SigmoidAVX512(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); + SIMD_ST_F32(dst + index, + SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); + } + return index; +} +
+static inline int TanhAVX512(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + index); + SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); + } + return index; +} +
+static inline int SwishAVX512(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); + SIMD_ST_F32(dst + index, + SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); + } + return index; +} +
+static inline int HSwishAVX512(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); + SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); + } + return index; +} +
+static inline int HSigmoidAVX512(int index, const float
*src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); + SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); + } + return index; +} + +static inline int HardTanhNoLimitMinAVX512(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); + } + return index; +} + +static inline int HardTanhNoLimitMaxAVX512(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); + } + return index; +} + +static inline int HardTanhLimitMinMaxAVX512(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); + } + return index; +} + +static inline int GeluApproximateAVX512(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in = SIMD_LD_F32(src + index); + SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); + SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); + SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); + } + return index; +} + +static inline int GeluAVX512(int index, const float *src, int length, float *dst) { + SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); + SIMD_F32 para2 = SIMD_MOV_F32(1.0f); + SIMD_F32 para3 = SIMD_MOV_F32(0.5f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in = SIMD_LD_F32(src + index); + SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); + SIMD_ST_F32(dst + index, res); + } + return index; +} + +static inline int EluAVX512(int index, const float *src, int length, float *dst, float alpha) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); + SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); + } + return index; +} + +static inline int CeluAVX512(int index, const float *src, int length, float *dst, float alpha) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); + SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); + } + return index; +} + +static inline int HShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) { + const float neg_lambd = -1 * lambd; + for (int block_max_size = 
length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd)); + SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp); + SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask)); + } + return index; +} + +static inline int SoftShrinkAVX512(int index, const float *src, int length, float *dst, float lambd) { + SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); + SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); + + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_t = SIMD_LD_F32(src + index); + /* v0 = (in > lamdb) & (in - lamdb) */ + SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v)); + /* v1 = (in < -lamdb) & (in + lamdb) */ + SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v)); + /* out = (v0 | v1) */ + SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1)); + } + return index; +} + +static inline int SoftsignFp32OptAVX512(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp)); + SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h new file mode 100644 index 00000000..62d34db4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/activation_grad_avx512.h @@ -0,0 +1,57 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_ +#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int ShrinkGradAVX512(int index, const float *src0, const float *src1, + int length, float *dst, float lambd) { + SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); + SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); + + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src0_t = SIMD_LD_F32(src0 + index); + SIMD_F32 src1_t = SIMD_LD_F32(src1 + index); + + SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v); + SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t); + SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); + + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h new file mode 100644 index 00000000..0579d58a --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/adam_fp32_avx512.h @@ -0,0 +1,210 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_ +#define MINDSPORE_NNACL_FP32_ADAM_FP32_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 +#ifdef MS_SIMD_AVX512 +static inline size_t AdamWeightDecayFp32AVX512(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + const float *gradient, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + +
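/* fused AdamW step: advance the biased moments m and v, form m / (sqrt(v) + epsilon), add the decoupled weight-decay term decay * var, then take the step scaled by -lr */ +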
for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_LD_F32(gradient + index); + + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + SIMD_ST_F32(var + index, var_r); + } + + return index; +} +
+static inline size_t FusedCastAdamFp32Fp16AVX512(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); + + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(var + index, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + } + + return index; +} +
+static inline size_t FusedCastAdamFp32Fp32AVX512(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); + + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(var + index, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + } + + return index; +} +
+static inline size_t FusedCastAdamFp16Fp16AVX512(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); + } + + return index; +} +
+static inline size_t FusedCastAdamFp16Fp32AVX512(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { +
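/* fp16 weights: widen this block of half-precision lanes to fp32, update, then narrow back on the store below */ +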
SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index)); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); + } + + return index; +} +#endif +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h new file mode 100644 index 00000000..5ec6a42e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/add_fp32_avx512.h @@ -0,0 +1,124 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef MINDSPORE_NNACL_FP32_ADD_AVX512_H_ +#define MINDSPORE_NNACL_FP32_ADD_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int ElementOptAddAVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddIntAVX512(int index, const int *in0, const int *in1, int *out, + int size) { + SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddReluAVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddRelu6AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddReluAVX512(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddRelu6AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + 
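/* elementwise int32 add, 16 lanes per step; the scalar tail past the last full block is handled by the caller, which resumes at the returned index */ +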
SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h new file mode 100644 index 00000000..aa478969 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_fp32_avx512.h @@ -0,0 +1,254 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_ARITHMETIC_AVX512_H_ +#define MINDSPORE_NNACL_ARITHMETIC_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +#ifndef MS_SIMD_NEON +static inline int ElementFloorModAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorModNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorModNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementFloorDivAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = 
SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, floor_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} +#endif + +static inline int ElementFloorDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMaximumAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = 
SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMaximumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum1AVX512(int 
index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
new file mode 100644
index 00000000..c671e327
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/arithmetic_self_fp32_avx512.h
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_
+#define MINDSPORE_NNACL_ARITHMETIC_SELF_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+#if defined(MS_SIMD_AVX512)
+// only AVX512 supports the abs fp32 instruction
+static inline int ElementAbsAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementAbsIntAVX512(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementSquareAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin = SIMD_LD_F32(input + index);
+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
+  }
+  return index;
+}
+
+static inline int ElementSqrtAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementRsqrtAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
+// AVX512 does not support the round fp32 instruction
+static inline int ElementRoundAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+// NEON does not support the floor fp32 instruction
+static inline int ElementFloorAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+static inline int ElementCeilAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementNegativeAVX512(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
+  }
+  return index;
+}
+
+static inline int ElementNegativeIntAVX512(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
+  }
+  return index;
+}
+
+static inline int ElementReciprocalAVX512(int index, const float *input, float *output, const int element_size) {
+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
new file mode 100644
index 00000000..fd945984
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/batchnorm_fp32_avx512.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_AVX512_H_
+#define MINDSPORE_NNACL_FP32_BATCHNORM_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+static inline int BatchNormFp32AVX512(int index, const float *input, const float *mean,
+                                      const float *variance, int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+static inline int FusedBatchNormFp32AVX512(int index, const float *input, const float *scale,
+                                           const float *offset, const float *mean, const float *variance,
+                                           int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
new file mode 100644
index 00000000..f5353f61
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bce_with_logits_loss_fp32_avx512.h
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_ +#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int BCEWithLogitLossAVX512(int index, const float *logits, const float *label, + const float *weight, const float *pos_weight, int length, bool reduction, float *output, + float *reduction_sum) { + SIMD_F32 zero = SIMD_SET0_F32; + SIMD_F32 ones = SIMD_MOV_F32(1.0f); + SIMD_F32 middle_output = SIMD_SET0_F32; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); + SIMD_F32 label_tmp = SIMD_LD_F32(label + index); + SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); + SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); + SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); + SIMD_F32 max_value = neg_logits_tmp; + max_value = SIMD_MIN_F32(max_value, zero); + SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); + SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); + SIMD_F32 log_exp_value = + SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); + SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), + SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); + if (reduction) { + middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); + } else { + SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp)); + } + } + if (reduction) { + *reduction_sum += SIMD_GET_SUM_F32(middle_output); + } + return index; +} +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h new file mode 100644 index 00000000..abdad5ff --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/bias_add_avx512.h @@ -0,0 +1,64 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
+#define MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+static inline int BiasAddByInnerCoreAVX512(int index, const float *input, const float *bias, float *output,
+                                           int64_t num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
+    SIMD_ST_F32(output + index, vout);
+  }
+  return index;
+}
+
+static inline int BiasAddByBatchCoreAVX512(int index, const float *input, const float *bias, float *output1,
+                                           float *output2, float *output3, float *output4, int64_t num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_LDX4_F32(input_data, input + index, num);
+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_AVX512_H_
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
new file mode 100644
index 00000000..91d52718
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cast_base_avx512.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_ +#define MINDSPORE_NNACL_BASE_CAST_BASE_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int Int32ToFloat32AVX512(int index, const int32_t *input, float *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 value = SIMD_LD_EPI32(input + index); + SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); + } + return index; +} + +#ifndef MS_SIMD_NEON +static inline int Float32ToInt32AVX512(int index, const float *input, int32_t *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 value = SIMD_LD_F32(input + index); + SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); + } + return index; +} +#endif + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h new file mode 100644 index 00000000..11a2abcf --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cdist_fp32_avx512.h @@ -0,0 +1,70 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_CDIST_AVX512_H_ +#define MINDSPORE_NNACL_FP32_CDIST_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int64_t CdistTwoNormalOptAVX512(int64_t index, const float *a, const float *b, + float *out, int64_t size) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +static inline int64_t CdistPNormalOptAVX512(int64_t index, const float *a, const float *b, + float *out, int64_t size, float p) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + SIMD_F32 p_vec = SIMD_MOV_F32(p); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); + result_vec = SIMD_ADD_F32(tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h new file mode 100644 index 00000000..f82adabf --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/cumsum_fp32_avx512.h @@ -0,0 +1,121 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_
+#define MINDSPORE_NNACL_FP32_CUMSUM_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumOutputInitWithInputAVX512(int64_t index, const float *layer_input,
+                                                      float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumOutputInitWithZeroAVX512(int64_t index, float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
+  }
+  return index;
+}
+
+static inline int64_t CumsumAVX512(int64_t index, const float *layer_input, float *layer_output,
+                                   float *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
+// (a, b, c) -> (c+b, c, 0) exclusive==true
+static inline int64_t CumsumReverseAVX512(int64_t index, const float *layer_input, float *layer_output,
+                                          float *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumIntOutputInitWithInputAVX512(int64_t index, const int *layer_input,
+                                                         int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntOutputInitWithZeroAVX512(int64_t index, int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntAVX512(int64_t index, const int *layer_input, int *layer_output,
+                                      int *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
+    SIMD_ST_EPI32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
+// (a, b, c) -> (c+b, c, 0) exclusive==true
+static inline int64_t CumsumReverseIntAVX512(int64_t index, const int *layer_input, int *layer_output,
+                                             int *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1);
+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1);
+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
+    SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
new file mode 100644
index 00000000..4de588fb
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/div_fp32_avx512.h
@@ -0,0 +1,167 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_DIV_AVX512_H_
+#define MINDSPORE_NNACL_FP32_DIV_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+static inline int ElementOptDivNum0AVX512(int index, const float *in0, const float *in1, float *out,
+                                          int size) {
+  SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptDivNum1AVX512(int index, const float *in0, const float *in1, float *out,
+                                          int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptDivIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptDivIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin1_opt_ =
SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum0AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum1AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementDivReluAVX512(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivRelu6AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 
= SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h new file mode 100644 index 00000000..eb847c23 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/dropout_fp32_avx512.h @@ -0,0 +1,46 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_ +#define MINDSPORE_NNACL_FP32_DROPOUTFP32_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int DropoutFp32AVX512(int index, const float *input, float scale, + int length, float *output) { + SIMD_F32 scale_value = SIMD_MOV_F32(scale); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); + } + return index; +} +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h new file mode 100644 index 00000000..14386f5f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/exp_fp32_avx512.h @@ -0,0 +1,63 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_EXP_AVX512_H_
+#define MINDSPORE_NNACL_FP32_EXP_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+static inline int64_t ExpFp32AVX512(int64_t index, const float *src, float *dst, int num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithInScaleAVX512(int64_t index, const float *src, float *dst, int num, float in_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithOutScaleAVX512(int64_t index, const float *src, float *dst, int num, float out_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
new file mode 100644
index 00000000..5eb04746
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/fill_base_avx512.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_ +#define MINDSPORE_NNACL_BASE_FILL_BASE_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int FillFp32AVX512(int index, float *output, int size, float data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); + } + return index; +} + +static inline int FillInt32AVX512(int index, int *output, int size, int data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif + diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h new file mode 100644 index 00000000..f26537d9 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/group_norm_fp32_avx512.h @@ -0,0 +1,77 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_ +#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int64_t GroupNormFp32AVX512(int64_t index, const float *unit_input, float scale, float offset, float mean, + float var_sqrt, int unit, float *unit_output) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); + SIMD_F32 scale_val = SIMD_MOV_F32(scale); + SIMD_F32 offset_val = SIMD_MOV_F32(offset); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(unit_input + index); + SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); + SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); + SIMD_ST_F32(unit_output + index, output); + } + return index; +} + +static inline int64_t GroupNormReduceSumAVX512(int64_t index, const float *in, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +static inline int64_t GroupNormReduceVarAVX512(int64_t index, const float *in, float mean, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h new file mode 100644 index 00000000..e5fb6d7b --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/layer_norm_fp32_avx512.h @@ -0,0 +1,68 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_
+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+static inline int LayerNormMeanAndSquareAVX512(int index, const float *src, int num, float *mean, float *square_mean) {
+  if (num >= 4 * BLOCK_NUM) {
+    SIMD_F32 sum_val = SIMD_SET0_F32;
+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+      SIMD_F32 value = SIMD_LD_F32(src + index);
+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
+      sum_val = SIMD_ADD_F32(sum_val, value);
+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
+    }
+    *mean += SIMD_GET_SUM_F32(sum_val);
+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
+  }
+  return index;
+}
+
+static inline int LayerNormGammaAndBetaAVX512(int index, float *dst, const float *src, const float *gamma_data,
+                                              const float *beta_data, int num, const float mean, const float deno) {
+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 value = SIMD_LD_F32(src + index);
+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
+    out_value = SIMD_MUL_F32(out_value, deno_val);
+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
+    SIMD_ST_F32(dst + index, out_value);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_AVX512
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
new file mode 100644
index 00000000..d51779d4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/matmul_fp32_avx512.h
@@ -0,0 +1,93 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_
+#define MINDSPORE_NNACL_FP32_MATMUL_F32_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+// act_type must be 0, 1, or 3. 0: no_act, 1: relu, 3: relu6.
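+// For example, with act_type == 3 each lane computes
+//   c[i] = min(max(b[0] * a[i] + bias[0], 0.0f), 6.0f),
+// i.e. a fused multiply-add followed by ReLU6; the returned index marks where
+// the caller's scalar tail loop must resume when row is not a multiple of BLOCK_NUM.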
+static inline int64_t GemmIsNotPackAVX512(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, + int deep, int act_type) { + SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); + SIMD_F32 up_threshold = SIMD_MOV_F32(6); + SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); + SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); + for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_data = SIMD_LD_F32(a + index); + SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); + if (act_type != 0) { + dst = SIMD_MAX_F32(dst, down_threshold); + if (act_type == 3) { + dst = SIMD_MIN_F32(dst, up_threshold); + } + } + SIMD_ST_F32(c + index, dst); + } + + return index; +} + +#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) +static inline int64_t GemmIsNotPackOptimizeCoreAVX512(int64_t index, const float *a, const float *b, int k, float *dst) { + SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); + for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 weight = SIMD_LD_F32(b + index); + SIMD_F32 a1 = SIMD_LD_F32(a + index); + dst1 = SIMD_FMADD_F32(weight, a1, dst1); + } + *dst += SIMD_REDUCE_ADD_F32(dst1); + return index; +} +#endif + +static inline int64_t MatVecMulNoPackCoreAVX512(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, + int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { + for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { + SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); + for (int64_t k = 0; k < depth; ++k) { + SIMD_F32 left = SIMD_MOV_F32(a[k]); + SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); + out = SIMD_FMADD_F32(left, right, out); + } + if ((inc_flag & 0x2) != 0 && act_type != 0) { + out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); + if (act_type == 0x3) { + out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); + } + } + SIMD_ST_F32(c + oc_index, out); + } + return oc_index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h new file mode 100644 index 00000000..e3b242e4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/mul_fp32_avx512.h @@ -0,0 +1,218 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_MUL_AVX512_H_
+#define MINDSPORE_NNACL_FP32_MUL_AVX512_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_avx512_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION
+#define BLOCK_NUM 16
+#define MS_SIMD_AVX512
+
+static inline int ElementMulAVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluAVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluIntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6IntAVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0), 6);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluNum0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluNum1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM)
{ + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulRelu6IntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulRelu6IntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h new file mode 100644 index 00000000..d1e001ee --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/pooling_fp32_avx512.h @@ -0,0 +1,84 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
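+ */ + +/* AVX512 pooling micro-kernels over NHWC data. Both kernels walk the channel dimension in 16-wide blocks starting at ci and return the first channel they did not cover, so callers finish the remaining channels with scalar code; minf and maxf carry the fused activation clamp (for example 0 and 6 for relu6).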
+ */ +#ifndef MINDSPORE_NNACL_FP32_POOLING_AVX512_H_ +#define MINDSPORE_NNACL_FP32_POOLING_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int AvgPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_avg = SIMD_SET0_F32; + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); + ++real_count; + } + } + tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); + tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); + tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_avg); + } + return ci; +} + +static inline int MaxPoolingBatchAVX512(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_max = min_val; + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); + } + } + tmp_max = SIMD_MIN_F32(tmp_max, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_max); + } + return ci; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h new file mode 100644 index 00000000..a31eaf2f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/power_fp32_avx512.h @@ -0,0 +1,101 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_POWER_AVX512_H_ +#define MINDSPORE_NNACL_FP32_POWER_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 +
+/* Integer exponents are evaluated by exponentiation-by-squaring: square the base, halve the exponent, and multiply the result in whenever the current exponent bit is set. */ +static inline int PowerBroadCastIntExponentAVX512(int index, const float *input, int exponent, float *output, int len, + float scale, float shift) { + SIMD_F32 scale_vec = SIMD_MOV_F32(scale); + SIMD_F32 shift_vec = SIMD_MOV_F32(shift); + for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); + SIMD_F32 result = SIMD_MOV_F32(1.0f); + int exp = abs(exponent); + while (exp) { + if (exp % 2) { + result = SIMD_MUL_F32(result, tmp); + } + tmp = SIMD_MUL_SQUARE_F32(tmp); + exp = exp / 2; + } + SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1.0f), result)); + } + return index; +} +
+static inline int PowerBroadCastFloatExponentAVX512(int index, const float *input, float exponent, float *output, int len, + float scale, float shift) { + SIMD_F32 scale_vec = SIMD_MOV_F32(scale); + SIMD_F32 shift_vec = SIMD_MOV_F32(shift); + for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); + SIMD_F32 result; + for (int i = 0; i < BLOCK_NUM; ++i) { + SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent); + } + SIMD_ST_F32(output + index, result); + } + return index; +} +
+static inline int PowerSingleExponentAVX512(int index, const float *input, const float *exponent, float *output, int len, + float scale, float shift) { + SIMD_F32 scale_vec = SIMD_MOV_F32(scale); + SIMD_F32 shift_vec = SIMD_MOV_F32(shift); + for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec); + for (int j = 0; j < BLOCK_NUM; ++j) { + float cur_exponent = exponent[index + j]; + float cur_val = SIMD_F32_GETI(tmp_vec, j); + if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) { + int exp = abs((int)(cur_exponent)); + float result = 1; + while (exp) { + if (exp % 2) { + result *= cur_val; + } + cur_val *= cur_val; + exp = exp / 2; + } + output[index + j] = cur_exponent >= 0 ? result : 1 / result; + } else { + output[index + j] = powf(cur_val, cur_exponent); + } + } + } + return index; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h new file mode 100644 index 00000000..5885a044 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/reduce_fp32_avx512.h @@ -0,0 +1,181 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_ +#define MINDSPORE_NNACL_FP32_REDUCE_FP32_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 +
+static inline int64_t ReduceSumAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceMeanAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size)); + } + return index; +} +
+static inline int64_t ReduceMinAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceMaxAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX); /* seed below every representable input; FLT_MIN is a tiny positive value and would corrupt all-negative columns */ + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceProdAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(1.0f); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size)); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} +
+static inline int64_t ReduceSumSquareAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index <
block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t ReduceL2NormAVX512(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); + } + return index; +} + +static inline int64_t IntReduceSumAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t IntReduceMeanAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); + } + return index; +} + +static inline int64_t IntReduceMinAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t IntReduceMaxAVX512(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h new file mode 100644 index 00000000..1fa1907e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/softmax_fp32_avx512.h @@ -0,0 +1,87 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with 
the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_ +#define MINDSPORE_NNACL_FP32_SOFTMAX_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 +
+static inline int64_t SoftmaxNormGetMaxAVX512(int64_t index, const float *src, int cur_batch_offset, + float *max, int channel) { + /* vectorize only sufficiently long rows; for short rows the function returns index unchanged and the caller's scalar path does the work */ + if (channel >= BLOCK_NUM * BLOCK_NUM) { + SIMD_F32 max_val = SIMD_MOV_F32(*max); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); + } + *max = SIMD_GET_MAX_F32(max_val); + } + return index; +} +
+static inline int64_t SoftmaxNormCalcNormAVX512(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, int channel) { + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} +
+static inline int64_t SoftmaxLastAxisGetExpSumAVX512(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, float *exp_sum, int channel) { +/* the vector exp path is compiled out on Windows; the call then returns index unchanged and the caller's scalar loop covers the whole row */ +#ifndef _WIN32 + SIMD_F32 sum_val = SIMD_SET0_F32; + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); + SIMD_F32 exp_out = SIMD_EXP_F32(output); + sum_val = SIMD_ADD_F32(sum_val, exp_out); + SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); + } + *exp_sum += SIMD_GET_SUM_F32(sum_val); +#endif + return index; +} +
+static inline int64_t SoftmaxLastAxisGetResultAVX512(int64_t index, const float *src, float *dst, + int cur_batch_offset, float exp_sum, int channel) { + /* note the multiply below: callers are expected to pass the reciprocal 1.0f / sum as exp_sum */ + SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h new file mode 100644 index 00000000..994fc7c0 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/avx512/sub_fp32_avx512.h @@ -0,0 +1,167 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SUB_AVX512_H_ +#define MINDSPORE_NNACL_FP32_SUB_AVX512_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_avx512_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("avx512f") +#define MS_SIMD_INSTRUCTION MS_SIMD_AVX512_INSTRUCTION +#define BLOCK_NUM 16 +#define MS_SIMD_AVX512 + +static inline int ElementOptSubNum0AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubNum1AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum0AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum1AVX512(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum0AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum1AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num0AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + 
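/* in0 is a scalar here, broadcast once outside the loop; the subtract result is clamped into [0, 6]. Only full 16-float blocks are processed; the caller finishes the tail from the returned index. */ +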
for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementOptSubRelu6Num1AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementSubAVX512(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementSubIntAVX512(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} +
+static inline int ElementSubReluAVX512(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+static inline int ElementSubRelu6AVX512(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} +
+#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_AVX512 +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h new file mode 100644 index 00000000..88908c90 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/batchnorm_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
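+ */ + +/* Dispatch umbrella: this header pulls in the batch-norm kernels for every SIMD ISA enabled at build time (ENABLE_AVX512 / ENABLE_AVX / ENABLE_SSE / ENABLE_ARM). + * Callers normally reach the kernels through nnacl's SIMD_RUN-style dispatch macros, which pick the widest compiled-in variant and leave the scalar tail to the caller; the macro names follow existing nnacl convention and are assumed here, not defined in this patch.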
+ */ +#ifndef MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_ +#define MINDSPORE_NNACL_BATCHNORM_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/batchnorm_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/batchnorm_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/batchnorm_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/batchnorm_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h new file mode 100644 index 00000000..f36981ab --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bce_with_logits_loss_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_ +#define MINDSPORE_NNACL_BCE_WITH_LOGITS_LOSS_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/bce_with_logits_loss_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/bce_with_logits_loss_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/bce_with_logits_loss_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/bce_with_logits_loss_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h new file mode 100644 index 00000000..e765b1eb --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/bias_add_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
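+ */ + +/* Same umbrella pattern as the other *_simd.h headers in this patch: only the ISA branches enabled by the build are included, and the scalar fallback stays with the calling kernel.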
+ */ +#ifndef MINDSPORE_NNACL_BIAS_ADD_SIMD_H_ +#define MINDSPORE_NNACL_BIAS_ADD_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/bias_add_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/bias_add_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/bias_add_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/bias_add_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h new file mode 100644 index 00000000..93d8ca33 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cast_base_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_CAST_BASE_SIMD_H_ +#define MINDSPORE_NNACL_CAST_BASE_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/cast_base_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/cast_base_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/cast_base_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/cast_base_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h new file mode 100644 index 00000000..70f79645 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cdist_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
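+ */ + +/* Umbrella header for the cdist (pairwise distance) kernels; one include per ISA, guarded by the build's ENABLE_* flags.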
+ */ +#ifndef MINDSPORE_NNACL_CDIST_FP32_SIMD_H_ +#define MINDSPORE_NNACL_CDIST_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/cdist_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/cdist_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/cdist_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/cdist_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h new file mode 100644 index 00000000..b6979626 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/cumsum_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_ +#define MINDSPORE_NNACL_CUMSUM_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/cumsum_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/cumsum_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/cumsum_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/cumsum_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h new file mode 100644 index 00000000..dcae16ff --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/div_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
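+ */ + +/* Umbrella for the element-wise division kernels. The per-ISA headers are expected to mirror the mul/sub naming seen above (ElementDiv..., ElementOptDiv...), each processing full blocks and returning the first unhandled index.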
+ */ +#ifndef MINDSPORE_NNACL_DIV_FP32_SIMD_H_ +#define MINDSPORE_NNACL_DIV_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/div_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/div_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/div_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/div_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h new file mode 100644 index 00000000..704591c5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/dropout_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_ +#define MINDSPORE_NNACL_DROPOUT_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/dropout_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/dropout_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/dropout_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/dropout_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h new file mode 100644 index 00000000..272f5934 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/exp_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
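+ */ + +/* Umbrella for the vectorized exponential kernels; the ISA branch is selected at build time through the ENABLE_* guards below.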
+ */ +#ifndef MINDSPORE_NNACL_EXP_FP32_SIMD_H_ +#define MINDSPORE_NNACL_EXP_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/exp_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/exp_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/exp_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/exp_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h new file mode 100644 index 00000000..f3099405 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/fill_base_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FILL_BASE_SIMD_H_ +#define MINDSPORE_NNACL_FILL_BASE_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/fill_base_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/fill_base_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/fill_base_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/fill_base_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h new file mode 100644 index 00000000..a3931c20 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/group_norm_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
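+ */ + +/* Umbrella for the group-norm kernels; same ENABLE_*-guarded include scheme as the neighbouring headers.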
+ */ +#ifndef MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_ +#define MINDSPORE_NNACL_GROUP_NORM_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/group_norm_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/group_norm_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/group_norm_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/group_norm_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h new file mode 100644 index 00000000..c08461d3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/layer_norm_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_ +#define MINDSPORE_NNACL_LAYER_NORM_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/layer_norm_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/layer_norm_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/layer_norm_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/layer_norm_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h new file mode 100644 index 00000000..1250f3fc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/matmul_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
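+ */ + +/* Umbrella for the matmul micro-kernels; only the ISA branches compiled into this build are exposed to the including kernel, so this header itself needs no runtime checks.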
+ */ +#ifndef MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_ +#define MINDSPORE_NNACL_MATMUL_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/matmul_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/matmul_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/matmul_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/matmul_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h new file mode 100644 index 00000000..31e08b08 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/mul_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_MUL_FP32_SIMD_H_ +#define MINDSPORE_NNACL_MUL_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/mul_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/mul_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/mul_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/mul_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h new file mode 100644 index 00000000..42d163f6 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_fp32_neon.h @@ -0,0 +1,220 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
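+ */ + +/* NEON activation kernels; BLOCK_NUM is 4 here (128-bit registers) instead of 16 for AVX512. A minimal caller sketch under the usual nnacl tail convention (variables illustrative): + * + * int i = Fp32ReluNEON(0, src, length, dst); + * for (; i < length; i++) { + * dst[i] = src[i] > 0.0f ? src[i] : 0.0f; + * }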
+ */ +#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_ +#define MINDSPORE_NNACL_FP32_ACTIVATION_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON +
+static inline int Fp32ReluNEON(int index, const float *src, int length, float *dst) { + SIMD_F32 zero = SIMD_SET0_F32; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); + } + return index; +} +
+static inline int Int32ReluNEON(int index, const int32_t *src, int length, int32_t *dst) { + SIMD_EPI32 zero = SIMD_MOV_EPI32(0); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); + } + return index; +} +
+static inline int Fp32Relu6NEON(int index, const float *src, int length, float *dst) { + SIMD_F32 zero = SIMD_SET0_F32; + SIMD_F32 six = SIMD_MOV_F32(6.0f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); + } + return index; +} +
+static inline int LReluNEON(int index, const float *src, int length, float *dst, float alpha) { + SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask)); + } + return index; +} +
+static inline int SigmoidNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); + SIMD_ST_F32(dst + index, + SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); + } + return index; +} +
+static inline int TanhNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + index); + SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); + } + return index; +} +
+static inline int SwishNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); + SIMD_ST_F32(dst + index, + SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); + } + return index; +} +
+static inline int HSwishNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); + SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); + } + return index; +} +
+static inline int HSigmoidNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1;
index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); + SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); + } + return index; +} + +static inline int HardTanhNoLimitMinNEON(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); + } + return index; +} + +static inline int HardTanhNoLimitMaxNEON(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); + } + return index; +} + +static inline int HardTanhLimitMinMaxNEON(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); + } + return index; +} + +static inline int GeluApproximateNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in = SIMD_LD_F32(src + index); + SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); + SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); + SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); + } + return index; +} + +static inline int GeluNEON(int index, const float *src, int length, float *dst) { + SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); + SIMD_F32 para2 = SIMD_MOV_F32(1.0f); + SIMD_F32 para3 = SIMD_MOV_F32(0.5f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in = SIMD_LD_F32(src + index); + SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); + SIMD_ST_F32(dst + index, res); + } + return index; +} + +static inline int EluNEON(int index, const float *src, int length, float *dst, float alpha) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); + SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); + } + return index; +} + +static inline int CeluNEON(int index, const float *src, int length, float *dst, float alpha) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); + SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); + } + return index; +} + +static inline int HShrinkNEON(int index, const float *src, int length, float *dst, float lambd) { + const float neg_lambd = -1 * lambd; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = 
SIMD_LD_F32(src + index); + SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd)); + SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp); + SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask)); + } + return index; +} + +static inline int SoftShrinkNEON(int index, const float *src, int length, float *dst, float lambd) { + SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); + SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); + + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_t = SIMD_LD_F32(src + index); + /* v0 = (in > lamdb) & (in - lamdb) */ + SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lamdb_v), SIMD_SUB_F32(src_t, pos_lamdb_v)); + /* v1 = (in < -lamdb) & (in + lamdb) */ + SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lamdb_v), SIMD_ADD_F32(src_t, pos_lamdb_v)); + /* out = (v0 | v1) */ + SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1)); + } + return index; +} + +static inline int SoftsignFp32OptNEON(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp)); + SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h new file mode 100644 index 00000000..df832e51 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/activation_grad_neon.h @@ -0,0 +1,56 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_ +#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int ShrinkGradNEON(int index, const float *src0, const float *src1, + int length, float *dst, float lambd) { + SIMD_F32 pos_lamdb_v = SIMD_MOV_F32(lambd); + SIMD_F32 neg_lamdb_v = SIMD_MOV_F32(-lambd); + + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src0_t = SIMD_LD_F32(src0 + index); + SIMD_F32 src1_t = SIMD_LD_F32(src1 + index); + + SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lamdb_v); + SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lamdb_v, src1_t); + SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1); + + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h new file mode 100644 index 00000000..fda41ec2 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/adam_fp32_neon.h @@ -0,0 +1,209 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_ +#define MINDSPORE_NNACL_FP32_ADAM_FP32_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON +#ifdef MS_SIMD_AVX512 + static inline size_t AdamWeightDecayFp32NEON(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + const float *gradient, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_LD_F32(gradient + index); + + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + SIMD_ST_F32(var + index, var_r); + } + + return index; +} + +static inline size_t FusedCastAdamFp32Fp16NEON(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); + + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(var + index, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + } + + return index; +} + +static inline size_t FusedCastAdamFp32Fp32NEON(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + 
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
+
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(var + index, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp16Fp16NEON(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                                               float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp16Fp32NEON(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                                               float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
+
SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index); + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0)); + } + + return index; +} +#endif + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h new file mode 100644 index 00000000..4ef32418 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/add_fp32_neon.h @@ -0,0 +1,123 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_NNACL_FP32_ADD_NEON_H_ +#define MINDSPORE_NNACL_FP32_ADD_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int ElementOptAddNEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddIntNEON(int index, const int *in0, const int *in1, int *out, + int size) { + SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddReluNEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddRelu6NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddReluNEON(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddRelu6NEON(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddIntNEON(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + 
index); + SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h new file mode 100644 index 00000000..2449c07d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_fp32_neon.h @@ -0,0 +1,253 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_ARITHMETIC_NEON_H_ +#define MINDSPORE_NNACL_ARITHMETIC_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +#ifndef MS_SIMD_NEON +static inline int ElementFloorModNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorModNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorModNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementFloorDivNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, floor_tmp); + } + return index; +} 
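A note on the convention these kernels share: each one processes full BLOCK_NUM-wide vectors while `index < size - BLOCK_NUM + 1` and returns the first index it did not handle, so the caller drains the remaining (at most BLOCK_NUM - 1) elements in scalar code. The float floor-mod/floor-div kernels above are additionally fenced with `#ifndef MS_SIMD_NEON`, so a NEON build skips the vector path for them entirely. A minimal sketch of the caller-side composition, assuming a plain-C fallback; the wrapper name and the ENABLE_AVX flag are illustrative, not part of this patch:

#include <math.h>

/* Hypothetical wrapper (not nnacl API): run the vector body where one was
 * generated, then finish the scalar tail from the returned index.
 * floor_mod(a, b) = a - floor(a / b) * b, a remainder that follows the
 * divisor's sign. */
static void ElementFloorModWrapper(const float *in0, const float *in1, float *out, int size) {
  int index = 0;
#ifdef ENABLE_AVX  /* illustrative build flag; no NEON vector path exists for this op */
  index = ElementFloorModAVX(index, in0, in1, out, size);
#endif
  for (; index < size; index++) {
    out[index] = in0[index] - floorf(in0[index] / in1[index]) * in1[index];
  }
}

The same drain-the-tail composition applies to every index-returning kernel in these generated headers.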
+ +static inline int ElementOptFloorDivNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} +#endif + +static inline int ElementFloorDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMaximumNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMaximumIntNEON(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - 
BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumIntNEON(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); 
+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+
+#undef BLOCK_NUM
+
+#undef MS_SIMD_NEON
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
new file mode 100644
index 00000000..682148d7
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/arithmetic_self_fp32_neon.h
@@ -0,0 +1,128 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_
+#define MINDSPORE_NNACL_ARITHMETIC_SELF_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+#if defined(MS_SIMD_AVX512)
+// only AVX512 supports the abs fp32 instruction
+static inline int ElementAbsNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementAbsIntNEON(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementSquareNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin = SIMD_LD_F32(input + index);
+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
+  }
+  return index;
+}
+
+static inline int ElementSqrtNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementRsqrtNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
+// AVX512 doesn't support the round fp32 instruction
+static inline int ElementRoundNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+// NEON doesn't support the floor fp32 instruction
+static inline int ElementFloorNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+// NEON doesn't support the ceil fp32 instruction
+static inline int ElementCeilNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementNegativeNEON(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
+  }
+  return index;
+}
+
+static inline int ElementNegativeIntNEON(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
+  }
+  return index;
+}
+
+static inline int ElementReciprocalNEON(int index, const float *input, float *output, const int element_size) {
+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+
+#undef MS_SIMD_NEON
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
new file mode 100644
index 00000000..5e169d62
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/batchnorm_fp32_neon.h
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_
+#define MINDSPORE_NNACL_FP32_BATCHNORM_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+static inline int BatchNormFp32NEON(int index, const float *input, const float *mean,
+                                    const float *variance, int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+static inline int FusedBatchNormFp32NEON(int index, const float *input, const float *scale,
+                                         const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+
+#undef MS_SIMD_NEON
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
new file mode 100644
index 00000000..3f52857c
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bce_with_logits_loss_fp32_neon.h
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_ +#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int BCEWithLogitLossNEON(int index, const float *logits, const float *label, + const float *weight, const float *pos_weight, int length, bool reduction, float *output, + float *reduction_sum) { + SIMD_F32 zero = SIMD_SET0_F32; + SIMD_F32 ones = SIMD_MOV_F32(1.0f); + SIMD_F32 middle_output = SIMD_SET0_F32; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index); + SIMD_F32 label_tmp = SIMD_LD_F32(label + index); + SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index); + SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index); + SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp); + SIMD_F32 max_value = neg_logits_tmp; + max_value = SIMD_MIN_F32(max_value, zero); + SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value); + SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones); + SIMD_F32 log_exp_value = + SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value)))); + SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp), + SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value))); + if (reduction) { + middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output); + } else { + SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp)); + } + } + if (reduction) { + *reduction_sum += SIMD_GET_SUM_F32(middle_output); + } + return index; +} +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h new file mode 100644 index 00000000..afaf0de5 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/bias_add_neon.h @@ -0,0 +1,63 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_ +#define MINDSPORE_NNACL_FP32_BIAS_ADD_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int BiasAddByInnerCoreNEON(int index, const float *input, const float *bias, float *output, + int64_t num) { + for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(input + index); + SIMD_F32 vin1 = SIMD_LD_F32(bias + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); + SIMD_ST_F32(output + index, vout); + } + return index; +} + +static inline int BiasAddByBatchCoreNEON(int index, const float *input, const float *bias, float *output1, + float *output2, float *output3, float *output4, int64_t num) { + for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_LDX4_F32(input_data, input + index, num); + SIMD_F32 bias_data = SIMD_LD_F32(bias + index); + SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data)); + SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data)); + SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data)); + SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +}; +#endif + +#endif // MINDSPORE_NNACL_FP32_BIAS_ADD_SIMD_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h new file mode 100644 index 00000000..8fe26687 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cast_base_neon.h @@ -0,0 +1,55 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_ +#define MINDSPORE_NNACL_BASE_CAST_BASE_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int Int32ToFloat32NEON(int index, const int32_t *input, float *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 value = SIMD_LD_EPI32(input + index); + SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); + } + return index; +} + +#ifndef MS_SIMD_NEON +static inline int Float32ToInt32NEON(int index, const float *input, int32_t *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 value = SIMD_LD_F32(input + index); + SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); + } + return index; +} +#endif + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h new file mode 100644 index 00000000..09f55bbf --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cdist_fp32_neon.h @@ -0,0 +1,69 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_CDIST_NEON_H_ +#define MINDSPORE_NNACL_FP32_CDIST_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int64_t CdistTwoNormalOptNEON(int64_t index, const float *a, const float *b, + float *out, int64_t size) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +static inline int64_t CdistPNormalOptNEON(int64_t index, const float *a, const float *b, + float *out, int64_t size, float p) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + SIMD_F32 p_vec = SIMD_MOV_F32(p); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); + result_vec = SIMD_ADD_F32(tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h new file mode 100644 index 00000000..d8a2580a --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/cumsum_fp32_neon.h @@ -0,0 +1,120 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_
+#define MINDSPORE_NNACL_FP32_CUMSUM_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumOutputInitWithInputNEON(int64_t index, const float *layer_input,
+                                                    float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumOutputInitWithZeroNEON(int64_t index, float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
+  }
+  return index;
+}
+
+static inline int64_t CumsumNEON(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
+                                 int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
+// (a, b, c) -> (c+b, c, 0) exclusive==true
+static inline int64_t CumsumReverseNEON(int64_t index, const float *layer_input, float *layer_output,
+                                        float *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumIntOutputInitWithInputNEON(int64_t index, const int *layer_input,
+                                                       int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntOutputInitWithZeroNEON(int64_t index, int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
+                                    int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
+    SIMD_ST_EPI32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
+// (a, b, c) -> (c+b, c, 0)
exclusive==true +static inline int64_t CumsumReverseIntNEON(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, + int inner_dim) { + for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); + SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); + SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); + SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h new file mode 100644 index 00000000..c4ce6594 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/div_fp32_neon.h @@ -0,0 +1,166 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_DIV_NEON_H_ +#define MINDSPORE_NNACL_FP32_DIV_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int ElementOptDivNum0NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivNum1NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = 
SIMD_DIV_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum0NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum1NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivIntNEON(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementDivReluNEON(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivRelu6NEON(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION 
+#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h new file mode 100644 index 00000000..b71db336 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/dropout_fp32_neon.h @@ -0,0 +1,45 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_ +#define MINDSPORE_NNACL_FP32_DROPOUTFP32_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int DropoutFp32NEON(int index, const float *input, float scale, + int length, float *output) { + SIMD_F32 scale_value = SIMD_MOV_F32(scale); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); + } + return index; +} +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h new file mode 100644 index 00000000..a594abd2 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/exp_fp32_neon.h @@ -0,0 +1,62 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_EXP_NEON_H_
+#define MINDSPORE_NNACL_FP32_EXP_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+static inline int64_t ExpFp32NEON(int64_t index, const float *src, float *dst, int num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithInScaleNEON(int64_t index, const float *src, float *dst, int num, float in_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithOutScaleNEON(int64_t index, const float *src, float *dst, int num, float out_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+
+#undef MS_SIMD_NEON
+#ifdef __cplusplus
+};
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
new file mode 100644
index 00000000..c467d2d9
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/fill_base_neon.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_ +#define MINDSPORE_NNACL_BASE_FILL_BASE_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int FillFp32NEON(int index, float *output, int size, float data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); + } + return index; +} + +static inline int FillInt32NEON(int index, int *output, int size, int data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif + diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h new file mode 100644 index 00000000..0eb6c9d2 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/group_norm_fp32_neon.h @@ -0,0 +1,76 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_ +#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int64_t GroupNormFp32NEON(int64_t index, const float *unit_input, float scale, float offset, float mean, + float var_sqrt, int unit, float *unit_output) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); + SIMD_F32 scale_val = SIMD_MOV_F32(scale); + SIMD_F32 offset_val = SIMD_MOV_F32(offset); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(unit_input + index); + SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); + SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); + SIMD_ST_F32(unit_output + index, output); + } + return index; +} + +static inline int64_t GroupNormReduceSumNEON(int64_t index, const float *in, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +static inline int64_t GroupNormReduceVarNEON(int64_t index, const float *in, float mean, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h new file mode 100644 index 00000000..0c528616 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/layer_norm_fp32_neon.h @@ -0,0 +1,67 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_
+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+static inline int LayerNormMeanAndSquareNEON(int index, const float *src, int num, float *mean, float *square_mean) {
+  if (num >= 4 * BLOCK_NUM) {
+    SIMD_F32 sum_val = SIMD_SET0_F32;
+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+      SIMD_F32 value = SIMD_LD_F32(src + index);
+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
+      sum_val = SIMD_ADD_F32(sum_val, value);
+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
+    }
+    *mean += SIMD_GET_SUM_F32(sum_val);
+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
+  }
+  return index;
+}
+
+static inline int LayerNormGammaAndBetaNEON(int index, float *dst, const float *src, const float *gamma_data,
+                                            const float *beta_data, int num, const float mean, const float deno) {
+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 value = SIMD_LD_F32(src + index);
+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
+    out_value = SIMD_MUL_F32(out_value, deno_val);
+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
+    SIMD_ST_F32(dst + index, out_value);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+
+#undef MS_SIMD_NEON
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
new file mode 100644
index 00000000..0e12e5a0
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/matmul_fp32_neon.h
@@ -0,0 +1,92 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_
+#define MINDSPORE_NNACL_FP32_MATMUL_F32_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+// act_type must be 0, 1, or 3. 0: no_act, 1: relu, 3: relu6.
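+// A minimal caller sketch (illustrative assumption, not generated code): the
+// kernel below broadcasts b[0] and bias[0] across the row, and the returned
+// index marks where a scalar tail loop has to take over, e.g.
+//   int64_t i = GemmIsNotPackNEON(0, a, b, c, bias, row, deep, act_type);
+//   for (; i < row; i++) {
+//     float v = a[i] * b[0] + bias[0];
+//     if (act_type != 0) v = v > 0.0f ? v : 0.0f;
+//     if (act_type == 3) v = v < 6.0f ? v : 6.0f;
+//     c[i] = v;
+//   }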
+static inline int64_t GemmIsNotPackNEON(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, + int deep, int act_type) { + SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); + SIMD_F32 up_threshold = SIMD_MOV_F32(6); + SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); + SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); + for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_data = SIMD_LD_F32(a + index); + SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); + if (act_type != 0) { + dst = SIMD_MAX_F32(dst, down_threshold); + if (act_type == 3) { + dst = SIMD_MIN_F32(dst, up_threshold); + } + } + SIMD_ST_F32(c + index, dst); + } + + return index; +} + +#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) +static inline int64_t GemmIsNotPackOptimizeCoreNEON(int64_t index, const float *a, const float *b, int k, float *dst) { + SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); + for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 weight = SIMD_LD_F32(b + index); + SIMD_F32 a1 = SIMD_LD_F32(a + index); + dst1 = SIMD_FMADD_F32(weight, a1, dst1); + } + *dst += SIMD_REDUCE_ADD_F32(dst1); + return index; +} +#endif + +static inline int64_t MatVecMulNoPackCoreNEON(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, + int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { + for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { + SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); + for (int64_t k = 0; k < depth; ++k) { + SIMD_F32 left = SIMD_MOV_F32(a[k]); + SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); + out = SIMD_FMADD_F32(left, right, out); + } + if ((inc_flag & 0x2) != 0 && act_type != 0) { + out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); + if (act_type == 0x3) { + out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); + } + } + SIMD_ST_F32(c + oc_index, out); + } + return oc_index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h new file mode 100644 index 00000000..33506e0c --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/mul_fp32_neon.h @@ -0,0 +1,217 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_MUL_NEON_H_
+#define MINDSPORE_NNACL_FP32_MUL_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+static inline int ElementMulNEON(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluNEON(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6NEON(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluIntNEON(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6IntNEON(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0.0f), 6.0f);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulNum0NEON(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulNum1NEON(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += 
BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulReluNum0NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulReluNum1NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulReluIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulReluIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = 
SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulRelu6IntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0.0f), 6.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptMulRelu6IntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h new file mode 100644 index 00000000..ea6acf62 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/pooling_fp32_neon.h @@ -0,0 +1,83 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_POOLING_NEON_H_ +#define MINDSPORE_NNACL_FP32_POOLING_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int AvgPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_avg = SIMD_SET0_F32; + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); + ++real_count; + } + } + tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); + tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); + tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_avg); + } + return ci; +} + +static inline int MaxPoolingBatchNEON(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_max = min_val; + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); + } + } + tmp_max = SIMD_MIN_F32(tmp_max, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_max); + } + return ci; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h new file mode 100644 index 00000000..fd8699c7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/power_fp32_neon.h @@ -0,0 +1,100 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_POWER_NEON_H_
+#define MINDSPORE_NNACL_FP32_POWER_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+static inline int PowerBroadCastIntExponentNEON(int index, const float *input, int exponent, float *output, int len,
+                                                float scale, float shift) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
+    int exp = abs(exponent);
+    while (exp) {
+      if (exp % 2) {
+        result = SIMD_MUL_F32(result, tmp);
+      }
+      tmp = SIMD_MUL_SQUARE_F32(tmp);
+      exp = exp / 2;
+    }
+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1), result));
+  }
+  return index;
+}
+
+static inline int PowerBroadCastFloatExponentNEON(int index, const float *input, float exponent, float *output, int len,
+                                                  float scale, float shift) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
+    SIMD_F32 result;
+    for (int i = 0; i < BLOCK_NUM; ++i) {
+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
+    }
+    SIMD_ST_F32(output + index, result);
+  }
+  return index;
+}
+
+static inline int PowerSingleExponentNEON(int index, const float *input, const float *exponent, float *output, int len,
+                                          float scale, float shift) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
+    for (int j = 0; j < BLOCK_NUM; ++j) {
+      float cur_exponent = exponent[index + j];
+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
+        int exp = abs((int)(cur_exponent));
+        float result = 1;
+        while (exp) {
+          if (exp % 2) {
+            result *= cur_val;
+          }
+          cur_val *= cur_val;
+          exp = exp / 2;
+        }
+        output[index + j] = cur_exponent >= 0 ? result : 1 / result;
+      } else {
+        output[index + j] = powf(cur_val, cur_exponent);
+      }
+    }
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+
+#undef MS_SIMD_NEON
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
new file mode 100644
index 00000000..7f9153f8
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/reduce_fp32_neon.h
@@ -0,0 +1,180 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_
+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_NEON_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_neon_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_NEON
+
+static inline int64_t ReduceSumNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                    int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(0);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceMeanNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                     int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(0);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
+  }
+  return index;
+}
+
+static inline int64_t ReduceMinNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                    int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceMaxNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                    int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceProdNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                     int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceSumSquareNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                          int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
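+    // Each lane accumulates x * x for one inner position across all axis_size
+    // rows; the lane-wise partial sums are stored below as a full vector, with
+    // no horizontal reduction.
+    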
SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t ReduceL2NormNEON(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); + } + return index; +} + +static inline int64_t IntReduceSumNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t IntReduceMeanNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); + } + return index; +} + +static inline int64_t IntReduceMinNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t IntReduceMaxNEON(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h new file mode 100644 index 00000000..f116d92f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/softmax_fp32_neon.h @@ -0,0 +1,86 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_ +#define MINDSPORE_NNACL_FP32_SOFTMAX_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int64_t SoftmaxNormGetMaxNEON(int64_t index, const float *src, int cur_batch_offset, + float *max, int channel) { + if (channel >= BLOCK_NUM * BLOCK_NUM) { + SIMD_F32 max_val = SIMD_MOV_F32(*max); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); + } + *max = SIMD_GET_MAX_F32(max_val); + } + return index; +} + +static inline int64_t SoftmaxNormCalcNormNEON(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, int channel) { + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} + +static inline int64_t SoftmaxLastAxisGetExpSumNEON(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, float *exp_sum, int channel) { +#ifndef _WIN32 + SIMD_F32 sum_val = SIMD_SET0_F32; + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); + SIMD_F32 exp_out = SIMD_EXP_F32(output); + sum_val = SIMD_ADD_F32(sum_val, exp_out); + SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); + } + *exp_sum += SIMD_GET_SUM_F32(sum_val); +#endif + return index; +} + +static inline int64_t SoftmaxLastAxisGetResultNEON(int64_t index, const float *src, float *dst, + int cur_batch_offset, float exp_sum, int channel) { + SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h new file mode 100644 index 00000000..d2731101 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/neon/sub_fp32_neon.h @@ -0,0 +1,166 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SUB_NEON_H_ +#define MINDSPORE_NNACL_FP32_SUB_NEON_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_neon_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MS_SIMD_INSTRUCTION MS_SIMD_NEON_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_NEON + +static inline int ElementOptSubNum0NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubNum1NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum0NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum1NEON(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum0NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum1NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num0NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = 
SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num1NEON(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubNEON(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubIntNEON(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementSubReluNEON(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubRelu6NEON(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM + +#undef MS_SIMD_NEON +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h new file mode 100644 index 00000000..75bda800 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/pooling_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_POOLING_FP32_SIMD_H_ +#define MINDSPORE_NNACL_POOLING_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/pooling_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/pooling_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/pooling_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/pooling_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h new file mode 100644 index 00000000..15e9f009 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/power_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_POWER_FP32_SIMD_H_ +#define MINDSPORE_NNACL_POWER_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/power_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/power_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/power_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/power_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h new file mode 100644 index 00000000..60d0cd85 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/reduce_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_ +#define MINDSPORE_NNACL_REDUCE_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/reduce_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/reduce_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/reduce_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/reduce_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h new file mode 100644 index 00000000..524668ab --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/softmax_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_ +#define MINDSPORE_NNACL_SOFTMAX_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/softmax_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/softmax_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/softmax_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/softmax_fp32_neon.h" +#endif + +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h new file mode 100644 index 00000000..192fc66d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_fp32_sse.h @@ -0,0 +1,221 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_ +#define MINDSPORE_NNACL_FP32_ACTIVATION_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int Fp32ReluSSE(int index, const float *src, int length, float *dst) { + SIMD_F32 zero = SIMD_SET0_F32; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MAX_F32(SIMD_LD_F32(src + index), zero)); + } + return index; +} + +static inline int Int32ReluSSE(int index, const int32_t *src, int length, int32_t *dst) { + SIMD_EPI32 zero = SIMD_MOV_EPI32(0.0f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(dst + index, SIMD_MAX_EPI32(SIMD_LD_EPI32(src + index), zero)); + } + return index; +} + +static inline int Fp32Relu6SSE(int index, const float *src, int length, float *dst) { + SIMD_F32 zero = SIMD_SET0_F32; + SIMD_F32 six = SIMD_MOV_F32(6.0f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_CLAMP_F32(SIMD_LD_F32(src + index), zero, six)); + } + return index; +} + +static inline int LReluSSE(int index, const float *src, int length, float *dst, float alpha) { + SIMD_F32 alpha_data = SIMD_MOV_F32(alpha); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_MASK mask = SIMD_CMPGT_F32(SIMD_SET0_F32, src_tmp); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_F32(src_tmp, alpha_data), mask)); + } + return index; +} + +static inline int SigmoidSSE(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, (SIMD_LD_F32(src + index))), dst + index); + SIMD_ST_F32(dst + index, + SIMD_DIV_F32(SIMD_MOV_F32(1.0f), SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); + } + return index; +} + +static inline int TanhSSE(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + index); + SIMD_ST_F32(dst + index, SIMD_TANH_F32(input)); + } + return index; +} + +static inline int SwishSSE(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_EXP_ST_F32(SIMD_SUB_F32(SIMD_SET0_F32, src_value), dst + index); + SIMD_ST_F32(dst + index, + SIMD_DIV_F32(src_value, SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_LD_F32(dst + index)))); + } + return index; +} + +static inline int HSwishSSE(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); + SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(SIMD_MUL_F32(src_value, relu6), 6)); + } + return index; +} + +static inline int HSigmoidSSE(int index, const float *src, int length, float *dst) { + for (int 
block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_value = SIMD_LD_F32(src + index); + SIMD_F32 relu6 = SIMD_CLAMP_N_F32(SIMD_ADD_N_F32(src_value, 3), 0, 6); + SIMD_ST_F32(dst + index, SIMD_DIV_N_F32(relu6, 6)); + } + return index; +} + +static inline int HardTanhNoLimitMinSSE(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MIN_N_F32(SIMD_LD_F32(src + index), max_val)); + } + return index; +} + +static inline int HardTanhNoLimitMaxSSE(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_MAX_N_F32(SIMD_LD_F32(src + index), min_val)); + } + return index; +} + +static inline int HardTanhLimitMinMaxSSE(int index, const float *src, int length, float *dst, float min_val, + float max_val) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(dst + index, SIMD_CLAMP_N_F32(SIMD_LD_F32(src + index), min_val, max_val)); + } + return index; +} + +static inline int GeluApproximateSSE(int index, const float *src, int length, float *dst) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in = SIMD_LD_F32(src + index); + SIMD_F32 tmp1 = SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.035677408136f), in); + SIMD_F32 tmp2 = SIMD_MUL_F32(SIMD_ADD_N_F32(tmp1, 0.79788456080287f), in); + SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_MUL_N_F32(in, 0.5f), SIMD_ADD_N_F32(SIMD_TANH_F32(tmp2), 1.0f))); + } + return index; +} + +static inline int GeluSSE(int index, const float *src, int length, float *dst) { + SIMD_F32 para1 = SIMD_MOV_F32(1.4142135623730951f); + SIMD_F32 para2 = SIMD_MOV_F32(1.0f); + SIMD_F32 para3 = SIMD_MOV_F32(0.5f); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in = SIMD_LD_F32(src + index); + SIMD_F32 res = SIMD_MUL_F32(SIMD_MUL_F32(para3, in), SIMD_ADD_F32(para2, SIMD_ERF_F32(SIMD_DIV_F32(in, para1)))); + SIMD_ST_F32(dst + index, res); + } + return index; +} + +static inline int EluSSE(int index, const float *src, int length, float *dst, float alpha) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(src_tmp), 1.0f); + SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); + } + return index; +} + +static inline int CeluSSE(int index, const float *src, int length, float *dst, float alpha) { + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 src_tmp = SIMD_LD_F32(src + index); + SIMD_F32 exp_tmp = SIMD_SUB_N_F32(SIMD_EXP_F32(SIMD_DIV_N_F32(src_tmp, alpha)), 1.0f); + SIMD_MASK mask = SIMD_CMPLE_F32(src_tmp, SIMD_SET0_F32); + SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MUL_N_F32(exp_tmp, alpha), mask)); + } + return index; +} + +static inline int HShrinkSSE(int index, const float *src, int length, float *dst, float lambd) { + const float neg_lambd = -1 * lambd; + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { 
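+    // HardShrink: keep x where |x| > lambd, zero out the band [-lambd, lambd]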
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_MASK mask0 = SIMD_CMPLE_F32(src_tmp, SIMD_MOV_F32(lambd));
+    SIMD_MASK mask1 = SIMD_CMPLE_F32(SIMD_MOV_F32(neg_lambd), src_tmp);
+    SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src_tmp, SIMD_MOV_F32(0.0f), mask));
+  }
+  return index;
+}
+
+static inline int SoftShrinkSSE(int index, const float *src, int length, float *dst, float lambd) {
+  SIMD_F32 pos_lambd_v = SIMD_MOV_F32(lambd);
+  SIMD_F32 neg_lambd_v = SIMD_MOV_F32(-lambd);
+
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_t = SIMD_LD_F32(src + index);
+    /* v0 = (in > lambd) & (in - lambd) */
+    SIMD_F32 value0 = SIMD_AND_MASK_F32(SIMD_CMPGT_F32(src_t, pos_lambd_v), SIMD_SUB_F32(src_t, pos_lambd_v));
+    /* v1 = (in < -lambd) & (in + lambd) */
+    SIMD_F32 value1 = SIMD_AND_MASK_F32(SIMD_CMPLT_F32(src_t, neg_lambd_v), SIMD_ADD_F32(src_t, pos_lambd_v));
+    /* out = (v0 | v1) */
+    SIMD_ST_F32(dst + index, SIMD_OR_F32(value0, value1));
+  }
+  return index;
+}
+
+static inline int SoftsignFp32OptSSE(int index, const float *src, int length, float *dst) {
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src_tmp = SIMD_LD_F32(src + index);
+    SIMD_F32 divisor_tmp = SIMD_ADD_F32(SIMD_MOV_F32(1.0f), SIMD_ABS_F32(src_tmp));
+    SIMD_ST_F32(dst + index, SIMD_DIV_F32(src_tmp, divisor_tmp));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
new file mode 100644
index 00000000..85996f69
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/activation_grad_sse.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_
+#define MINDSPORE_NNACL_FP32_GRAD_ACTIVATION_GRAD_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int ShrinkGradSSE(int index, const float *src0, const float *src1,
+                                int length, float *dst, float lambd) {
+  SIMD_F32 pos_lambd_v = SIMD_MOV_F32(lambd);
+  SIMD_F32 neg_lambd_v = SIMD_MOV_F32(-lambd);
+
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 src0_t = SIMD_LD_F32(src0 + index);
+    SIMD_F32 src1_t = SIMD_LD_F32(src1 + index);
+
+    SIMD_MASK mask0 = SIMD_CMPLE_F32(src1_t, pos_lambd_v);
+    SIMD_MASK mask1 = SIMD_CMPLE_F32(neg_lambd_v, src1_t);
+    SIMD_MASK mask = SIMD_AND_MASK(mask0, mask1);
+
+    SIMD_ST_F32(dst + index, SIMD_BLEND_F32(src0_t, SIMD_MOV_F32(0.0f), mask));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
new file mode 100644
index 00000000..1f5291a4
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/adam_fp32_sse.h
@@ -0,0 +1,210 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_ +#define MINDSPORE_NNACL_FP32_ADAM_FP32_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE +#ifdef MS_SIMD_AVX512 + static inline size_t AdamWeightDecayFp32SSE(size_t index, float *var, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + const float *gradient, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_LD_F32(gradient + index); + + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + SIMD_ST_F32(var + index, var_r); + } + + return index; +} + +static inline size_t FusedCastAdamFp32Fp16SSE(size_t index, float *var, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + SIMD_F32 beta2_r = SIMD_MOV_F32(beta2); + SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1); + SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2); + SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr); + SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon); + SIMD_F32 decay_r = SIMD_MOV_F32(decay); + SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal); + + for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 var_r = SIMD_LD_F32(var + index); + SIMD_F32 m_r = SIMD_LD_F32(m + index); + SIMD_F32 v_r = SIMD_LD_F32(v + index); + SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index)); + + g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r); + m_r = SIMD_MUL_F32(m_r, beta1_r); + v_r = SIMD_MUL_F32(v_r, beta2_r); + SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r); + m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r); + v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r); + avx_r0 = SIMD_SQRT_F32(v_r); + avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r)); + avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0); + var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r); + SIMD_ST_F32(var + index, var_r); + SIMD_ST_F32(m + index, m_r); + SIMD_ST_F32(v + index, v_r); + } + + return index; +} + +static inline size_t FusedCastAdamFp32Fp32SSE(size_t index, float *var, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay, + float global_norm_reciprocal, size_t end) { + SIMD_F32 beta1_r = SIMD_MOV_F32(beta1); + 
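// broadcast the scalar hyper-parameters into vectors once, outside the element loop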
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_LD_F32(var + index);
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
+
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(var + index, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp16Fp16SSE(size_t index, int16_t *var16, const int16_t *gradient16, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                                              float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(gradient16 + index));
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
+  }
+
+  return index;
+}
+
+static inline size_t FusedCastAdamFp16Fp32SSE(size_t index, int16_t *var16, const float *gradient32, float *m, float *v, float lr, float beta1, float beta2, float epsilon, float decay,
+                                              float global_norm_reciprocal, size_t end) {
+  SIMD_F32 beta1_r = SIMD_MOV_F32(beta1);
+  SIMD_F32 beta2_r = SIMD_MOV_F32(beta2);
+  SIMD_F32 beta1_minus_r = SIMD_MOV_F32(1.0f - beta1);
+  SIMD_F32 beta2_minus_r = SIMD_MOV_F32(1.0f - beta2);
+  SIMD_F32 lr_neg_r = SIMD_MOV_F32(-lr);
+  SIMD_F32 epsilon_r = SIMD_MOV_F32(epsilon);
+  SIMD_F32 decay_r = SIMD_MOV_F32(decay);
+  SIMD_F32 global_norm_reciprocal_r = SIMD_MOV_F32(global_norm_reciprocal);
+
+  for (size_t block_max_size = end - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 var_r = SIMD_F16_TO_F32(SIMD_LD_HALF_EPI32(var16 + index));
+    SIMD_F32 m_r = SIMD_LD_F32(m + index);
+    SIMD_F32 v_r = SIMD_LD_F32(v + index);
+    SIMD_F32 g_r = SIMD_LD_F32(gradient32 + index);
+    g_r = SIMD_MUL_F32(g_r, global_norm_reciprocal_r);
+    m_r = SIMD_MUL_F32(m_r, beta1_r);
+    v_r = SIMD_MUL_F32(v_r, beta2_r);
+    SIMD_F32 avx_r0 = SIMD_MUL_F32(g_r, g_r);
+    m_r = SIMD_FMADD_F32(g_r, beta1_minus_r, m_r);
+    v_r = SIMD_FMADD_F32(avx_r0, beta2_minus_r, v_r);
+    avx_r0 = SIMD_SQRT_F32(v_r);
+    avx_r0 = SIMD_DIV_F32(m_r, SIMD_ADD_F32(avx_r0, epsilon_r));
+    avx_r0 = SIMD_FMADD_F32(var_r, decay_r, avx_r0);
+    var_r = SIMD_FMADD_F32(avx_r0, lr_neg_r, var_r);
+    SIMD_ST_F32(m + index, m_r);
+    SIMD_ST_F32(v + index, v_r);
+    SIMD_ST_HALF_EPI32(var16 + index, SIMD_F32_TO_F16(var_r, 0));
+  }
+
+  return index;
+}
+#endif
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
new file mode 100644
index 00000000..eb705534
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/add_fp32_sse.h
@@ -0,0 +1,124 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_NNACL_FP32_ADD_SSE_H_ +#define MINDSPORE_NNACL_FP32_ADD_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int ElementOptAddSSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0_, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddIntSSE(int index, const int *in0, const int *in1, int *out, + int size) { + SIMD_EPI32 vin0_ = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0_, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddReluSSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptAddRelu6SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_ = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0_, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddReluSSE(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddRelu6SSE(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_ADD_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementAddIntSSE(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + 
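// same element-wise pattern as the float variants, using EPI32 loads/adds for int32 +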
SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_ADD_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h new file mode 100644 index 00000000..173890b4 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_fp32_sse.h @@ -0,0 +1,254 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_ARITHMETIC_SSE_H_ +#define MINDSPORE_NNACL_ARITHMETIC_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +#ifndef MS_SIMD_NEON +static inline int ElementFloorModSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorModNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorModNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 floor_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_F32 out_tmp = SIMD_SUB_F32(in0_tmp, SIMD_MUL_F32(floor_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementFloorDivSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 floor_tmp = 
SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, floor_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_FLOOR_F32(SIMD_DIV_F32(in0_tmp, in1_tmp)); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} +#endif + +static inline int ElementFloorDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptFloorDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_DIV_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMaximumSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 out_tmp = SIMD_MAX_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int 
ElementMaximumIntSSE(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMaximumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MAX_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumIntSSE(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in0_tmp = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in1_tmp = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 in1_tmp = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 in0_tmp = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 out_tmp = SIMD_MIN_EPI32(in0_tmp, in1_tmp); + SIMD_ST_EPI32(out + index, out_tmp); + } + return index; +} + +static inline int ElementMinimumSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index); + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum0SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in0_tmp = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 in1_tmp = SIMD_LD_F32(in1 + index); + SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp); + SIMD_ST_F32(out + index, out_tmp); + } + return index; +} + +static inline int ElementOptMinimumNum1SSE(int index, const float *in0, const float *in1, float *out, int size) { + SIMD_F32 in1_tmp = SIMD_MOV_F32(in1[0]); + for (int block_max_size = 
size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 in0_tmp = SIMD_LD_F32(in0 + index);
+    SIMD_F32 out_tmp = SIMD_MIN_F32(in0_tmp, in1_tmp);
+    SIMD_ST_F32(out + index, out_tmp);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
new file mode 100644
index 00000000..0a1d21c2
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/arithmetic_self_fp32_sse.h
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_
+#define MINDSPORE_NNACL_ARITHMETIC_SELF_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+#if defined(MS_SIMD_AVX512)
+// only avx512 supports an abs fp32 instruction
+static inline int ElementAbsSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ABS_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementAbsIntSSE(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_ABS_EPI32(SIMD_LD_EPI32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementSquareSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin = SIMD_LD_F32(input + index);
+    SIMD_ST_F32(output + index, SIMD_MUL_F32(vin, vin));
+  }
+  return index;
+}
+
+static inline int ElementSqrtSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_SQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+static inline int ElementRsqrtSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_RSQRT_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#if defined(MS_SIMD_AVX) || defined(MS_SIMD_SSE)
+// avx512 doesn't support a round fp32 instruction
+static inline int ElementRoundSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_ROUND_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+// neon doesn't support a floor fp32 instruction
+static inline int ElementFloorSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_FLOOR_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+#ifndef MS_SIMD_NEON
+static inline int ElementCeilSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_CEIL_F32(SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+#endif
+
+static inline int ElementNegativeSSE(int index, const float *input, float *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_MUL_N_F32(SIMD_LD_F32(input + index), -1.0f));
+  }
+  return index;
+}
+
+static inline int ElementNegativeIntSSE(int index, const int *input, int *output, const int element_size) {
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(output + index, SIMD_MUL_N_EPI32(SIMD_LD_EPI32(input + index), -1));
+  }
+  return index;
+}
+
+static inline int ElementReciprocalSSE(int index, const float *input, float *output, const int element_size) {
+  SIMD_F32 num1 = SIMD_MOV_F32(1.0f);
+  for (int block_max_size = element_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(output + index, SIMD_DIV_F32(num1, SIMD_LD_F32(input + index)));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
new file mode 100644
index 00000000..f04b4e1f
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/batchnorm_fp32_sse.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_
+#define MINDSPORE_NNACL_FP32_BATCHNORM_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int BatchNormFp32SSE(int index, const float *input, const float *mean,
+                                   const float *variance, int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 output_data = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+static inline int FusedBatchNormFp32SSE(int index, const float *input, const float *scale,
+                                        const float *offset, const float *mean, const float *variance, int channel, float epsilon, float *output) {
+  for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_data = SIMD_LD_F32(input + index);
+    SIMD_F32 scale_ = SIMD_LD_F32(scale + index);
+    SIMD_F32 offset_ = SIMD_LD_F32(offset + index);
+    SIMD_F32 mean_ = SIMD_LD_F32(mean + index);
+    SIMD_F32 variance_ = SIMD_LD_F32(variance + index);
+    SIMD_F32 variance_sqrt = SIMD_SQRT_F32(SIMD_ADD_F32(variance_, SIMD_MOV_F32(epsilon)));
+    SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input_data, mean_), variance_sqrt);
+    SIMD_F32 output_data = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_), offset_);
+    SIMD_ST_F32(output + index, output_data);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
new file mode 100644
index 00000000..c929ccaf
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bce_with_logits_loss_fp32_sse.h
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_
+#define MINDSPORE_NNACL_FP32_BCE_WITH_LOGITS_LOSS_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int BCEWithLogitLossSSE(int index, const float *logits, const float *label,
+                                      const float *weight, const float *pos_weight, int length, bool reduction, float *output,
+                                      float *reduction_sum) {
+  SIMD_F32 zero = SIMD_SET0_F32;
+  SIMD_F32 ones = SIMD_MOV_F32(1.0f);
+  SIMD_F32 middle_output = SIMD_SET0_F32;
+  for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 logits_tmp = SIMD_LD_F32(logits + index);
+    SIMD_F32 label_tmp = SIMD_LD_F32(label + index);
+    SIMD_F32 weight_tmp = SIMD_LD_F32(weight + index);
+    SIMD_F32 pos_weight_tmp = SIMD_LD_F32(pos_weight + index);
+    SIMD_F32 neg_logits_tmp = SIMD_SUB_F32(zero, logits_tmp);
+    SIMD_F32 max_value = neg_logits_tmp;
+    max_value = SIMD_MAX_F32(max_value, zero);
+    SIMD_F32 neg_max_value = SIMD_SUB_F32(zero, max_value);
+    SIMD_F32 log_weight = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(pos_weight_tmp, ones), label_tmp), ones);
+    SIMD_F32 log_exp_value =
+      SIMD_LOG_F32(SIMD_ADD_F32(SIMD_HEXP_F32(neg_max_value), SIMD_HEXP_F32(SIMD_SUB_F32(neg_logits_tmp, max_value))));
+    SIMD_F32 loss = SIMD_ADD_F32(SIMD_MUL_F32(SIMD_SUB_F32(ones, label_tmp), logits_tmp),
+                                 SIMD_MUL_F32(log_weight, SIMD_ADD_F32(log_exp_value, max_value)));
+    if (reduction) {
+      middle_output = SIMD_FMADD_F32(loss, weight_tmp, middle_output);
+    } else {
+      SIMD_ST_F32(output + index, SIMD_MUL_F32(loss, weight_tmp));
+    }
+  }
+  if (reduction) {
+    *reduction_sum += SIMD_GET_SUM_F32(middle_output);
+  }
+  return index;
+}
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
new file mode 100644
index 00000000..0544d239
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/bias_add_sse.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
+#define MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int BiasAddByInnerCoreSSE(int index, const float *input, const float *bias, float *output,
+                                        int64_t num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(input + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(bias + index);
+    SIMD_F32 vout = SIMD_ADD_F32(vin0, vin1);
+    SIMD_ST_F32(output + index, vout);
+  }
+  return index;
+}
+
+static inline int BiasAddByBatchCoreSSE(int index, const float *input, const float *bias, float *output1,
+                                        float *output2, float *output3, float *output4, int64_t num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_LDX4_F32(input_data, input + index, num);
+    SIMD_F32 bias_data = SIMD_LD_F32(bias + index);
+    SIMD_ST_F32(output1 + index, SIMD_ADD_F32(input_data1, bias_data));
+    SIMD_ST_F32(output2 + index, SIMD_ADD_F32(input_data2, bias_data));
+    SIMD_ST_F32(output3 + index, SIMD_ADD_F32(input_data3, bias_data));
+    SIMD_ST_F32(output4 + index, SIMD_ADD_F32(input_data4, bias_data));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_NNACL_FP32_BIAS_ADD_SSE_H_
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
new file mode 100644
index 00000000..4eca209f
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cast_base_sse.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#ifndef MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_ +#define MINDSPORE_NNACL_BASE_CAST_BASE_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int Int32ToFloat32SSE(int index, const int32_t *input, float *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 value = SIMD_LD_EPI32(input + index); + SIMD_ST_F32(output + index, SIMD_EPI32_TO_F32(value)); + } + return index; +} + +#ifndef MS_SIMD_NEON +static inline int Float32ToInt32SSE(int index, const float *input, int32_t *output, int number) { + for (int block_max_size = number - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 value = SIMD_LD_F32(input + index); + SIMD_ST_EPI32(output + index, SIMD_F32_TO_EPI32(value)); + } + return index; +} +#endif + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h new file mode 100644 index 00000000..3d116113 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cdist_fp32_sse.h @@ -0,0 +1,70 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_CDIST_SSE_H_ +#define MINDSPORE_NNACL_FP32_CDIST_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int64_t CdistTwoNormalOptSSE(int64_t index, const float *a, const float *b, + float *out, int64_t size) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + result_vec = SIMD_FMADD_F32(tmp_vec, tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +static inline int64_t CdistPNormalOptSSE(int64_t index, const float *a, const float *b, + float *out, int64_t size, float p) { + SIMD_F32 result_vec = SIMD_MOV_F32(0.0f); + SIMD_F32 p_vec = SIMD_MOV_F32(p); + for (int64_t block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_vec = SIMD_LD_F32(a + index); + SIMD_F32 b_vec = SIMD_LD_F32(b + index); + SIMD_F32 tmp_vec = SIMD_SUB_F32(a_vec, b_vec); + tmp_vec = SIMD_ABS_F32(tmp_vec); + tmp_vec = SIMD_POW_F32(tmp_vec, p_vec); + result_vec = SIMD_ADD_F32(tmp_vec, result_vec); + } + *out += SIMD_GET_SUM_F32(result_vec); + + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h new file mode 100644 index 00000000..1b67143f --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/cumsum_fp32_sse.h @@ -0,0 +1,121 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_
+#define MINDSPORE_NNACL_FP32_CUMSUM_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumOutputInitWithInputSSE(int64_t index, const float *layer_input,
+                                                   float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_LD_F32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumOutputInitWithZeroSSE(int64_t index, float *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_F32(layer_output + index, SIMD_MOV_F32(0.0f));
+  }
+  return index;
+}
+
+static inline int64_t CumsumSSE(int64_t index, const float *layer_input, float *layer_output, float *layer_last_output,
+                                int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input + index);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output + index);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) exclusive==false
+// (a, b, c) -> (c+b, c, 0) exclusive==true
+static inline int64_t CumsumReverseSSE(int64_t index, const float *layer_input, float *layer_output,
+                                       float *layer_last_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 input_val = SIMD_LD_F32(layer_input - index - BLOCK_NUM + 1);
+    SIMD_F32 last_output_val = SIMD_LD_F32(layer_last_output - index - BLOCK_NUM + 1);
+    SIMD_F32 out_val = SIMD_ADD_F32(input_val, last_output_val);
+    SIMD_ST_F32(layer_output - index - BLOCK_NUM + 1, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (a, a+b, a+b+c) exclusive == false
+// (a, b, c) -> (0, a, a+b) exclusive == true
+static inline int64_t CumsumIntOutputInitWithInputSSE(int64_t index, const int *layer_input,
+                                                      int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_LD_EPI32(layer_input + index));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntOutputInitWithZeroSSE(int64_t index, int *layer_output, int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_ST_EPI32(layer_output + index, SIMD_MOV_EPI32(0));
+  }
+  return index;
+}
+
+static inline int64_t CumsumIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output,
+                                   int inner_dim) {
+  for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input + index);
+    SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output + index);
+    SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val);
+    SIMD_ST_EPI32(layer_output + index, out_val);
+  }
+  return index;
+}
+
+// (a, b, c) -> (c+b+a, c+b, c) 
exclusive==false +// (a, b, c) -> (c+b, c, 0) exclusive==true +static inline int64_t CumsumReverseIntSSE(int64_t index, const int *layer_input, int *layer_output, int *layer_last_output, + int inner_dim) { + for (int block_max_size = inner_dim - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 input_val = SIMD_LD_EPI32(layer_input - index - BLOCK_NUM + 1); + SIMD_EPI32 last_output_val = SIMD_LD_EPI32(layer_last_output - index - BLOCK_NUM + 1); + SIMD_EPI32 out_val = SIMD_ADD_EPI32(input_val, last_output_val); + SIMD_ST_EPI32(layer_output - index - BLOCK_NUM + 1, out_val); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h new file mode 100644 index 00000000..5f0c6009 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/div_fp32_sse.h @@ -0,0 +1,167 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_DIV_SSE_H_ +#define MINDSPORE_NNACL_FP32_DIV_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int ElementOptDivNum0SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivNum1SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum0SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivReluNum1SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptDivRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_DIV_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivIntSSE(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_DIV_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementDivReluSSE(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementDivRelu6SSE(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_DIV_F32(vin0, vin1), 
0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h new file mode 100644 index 00000000..2429ed38 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/dropout_fp32_sse.h @@ -0,0 +1,46 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_ +#define MINDSPORE_NNACL_FP32_DROPOUTFP32_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int DropoutFp32SSE(int index, const float *input, float scale, + int length, float *output) { + SIMD_F32 scale_value = SIMD_MOV_F32(scale); + for (int block_max_size = length - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MUL_F32(SIMD_LD_F32(input + index), scale_value)); + } + return index; +} +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h new file mode 100644 index 00000000..3d802fb3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/exp_fp32_sse.h @@ -0,0 +1,63 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_NNACL_FP32_EXP_SSE_H_
+#define MINDSPORE_NNACL_FP32_EXP_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int64_t ExpFp32SSE(int64_t index, const float *src, float *dst, int num) {
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithInScaleSSE(int64_t index, const float *src, float *dst, int num, float in_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(in_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_MUL_F32(SIMD_LD_F32(src + index), scale_vec), dst + index);
+  }
+  return index;
+}
+
+static inline int64_t ExpFp32WithOutScaleSSE(int64_t index, const float *src, float *dst, int num, float out_scale) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(out_scale);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EXP_ST_F32(SIMD_LD_F32(src + index), dst + index);
+    SIMD_ST_F32(dst + index, SIMD_MUL_F32(SIMD_LD_F32(dst + index), scale_vec));
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
new file mode 100644
index 00000000..9c71eefb
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/fill_base_sse.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_ +#define MINDSPORE_NNACL_BASE_FILL_BASE_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int FillFp32SSE(int index, float *output, int size, float data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_F32(output + index, SIMD_MOV_F32(data)); + } + return index; +} + +static inline int FillInt32SSE(int index, int *output, int size, int data) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_ST_EPI32(output + index, SIMD_MOV_EPI32(data)); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif + diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h new file mode 100644 index 00000000..1c1f57da --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/group_norm_fp32_sse.h @@ -0,0 +1,77 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_ +#define MINDSPORE_NNACL_FP32_GROUP_NORM_FP32_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int64_t GroupNormFp32SSE(int64_t index, const float *unit_input, float scale, float offset, float mean, + float var_sqrt, int unit, float *unit_output) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 v_sqrt = SIMD_MOV_F32(var_sqrt); + SIMD_F32 scale_val = SIMD_MOV_F32(scale); + SIMD_F32 offset_val = SIMD_MOV_F32(offset); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(unit_input + index); + SIMD_F32 norm_val = SIMD_DIV_F32(SIMD_SUB_F32(input, mean_val), v_sqrt); + SIMD_F32 output = SIMD_ADD_F32(SIMD_MUL_F32(norm_val, scale_val), offset_val); + SIMD_ST_F32(unit_output + index, output); + } + return index; +} + +static inline int64_t GroupNormReduceSumSSE(int64_t index, const float *in, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(in + index)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +static inline int64_t GroupNormReduceVarSSE(int64_t index, const float *in, float mean, float *sum, int unit) { + if (unit - index >= 4 * BLOCK_NUM) { + SIMD_F32 mean_val = SIMD_MOV_F32(mean); + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int block_max_size = unit - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_SUB_F32(SIMD_LD_F32(in + index), mean_val); + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_F32(input, input)); + } + *sum += SIMD_GET_SUM_F32(tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h new file mode 100644 index 00000000..30af87c3 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/layer_norm_fp32_sse.h @@ -0,0 +1,68 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_
+#define MINDSPORE_NNACL_FP32_LAYER_NORM_FP32_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int LayerNormMeanAndSquareSSE(int index, const float *src, int num, float *mean, float *square_mean) {
+  if (num >= 4 * BLOCK_NUM) {
+    SIMD_F32 sum_val = SIMD_SET0_F32;
+    SIMD_F32 square_sum_val = SIMD_SET0_F32;
+    for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+      SIMD_F32 value = SIMD_LD_F32(src + index);
+      SIMD_F32 square_value = SIMD_MUL_F32(value, value);
+      sum_val = SIMD_ADD_F32(sum_val, value);
+      square_sum_val = SIMD_ADD_F32(square_sum_val, square_value);
+    }
+    *mean += SIMD_GET_SUM_F32(sum_val);
+    *square_mean += SIMD_GET_SUM_F32(square_sum_val);
+  }
+  return index;
+}
+
+static inline int LayerNormGammaAndBetaSSE(int index, float *dst, const float *src, const float *gamma_data,
+                                           const float *beta_data, int num, const float mean, const float deno) {
+  SIMD_F32 mean_val = SIMD_MOV_F32(mean);
+  SIMD_F32 deno_val = SIMD_MOV_F32(deno);
+  for (int block_max_size = num - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 value = SIMD_LD_F32(src + index);
+    SIMD_F32 out_value = SIMD_SUB_F32(value, mean_val);
+    out_value = SIMD_MUL_F32(out_value, deno_val);
+    out_value = SIMD_FMADD_F32(out_value, SIMD_LD_F32(gamma_data + index), SIMD_LD_F32(beta_data + index));
+    SIMD_ST_F32(dst + index, out_value);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
new file mode 100644
index 00000000..aef5b2a1
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/matmul_fp32_sse.h
@@ -0,0 +1,93 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_
+#define MINDSPORE_NNACL_FP32_MATMUL_F32_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+// act_type: 0 = no activation, 1 = relu, 3 = relu6.
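+// Usage sketch (illustrative only, not part of the generated API; `a`, `b`,
+// `c`, `bias`, `row`, `deep` stand for hypothetical caller buffers/sizes):
+// each helper consumes full BLOCK_NUM-wide blocks starting at `index` and
+// returns the first unprocessed index, so the caller finishes the tail with
+// scalar code, e.g.
+//
+//   int64_t i = GemmIsNotPackSSE(0, a, b, c, bias, row, deep, act_type);
+//   for (; i < row; i++) {
+//     float v = a[i] * b[0] + bias[0];
+//     if (act_type != 0) v = v < 0.0f ? 0.0f : v;
+//     if (act_type == 3) v = v > 6.0f ? 6.0f : v;
+//     c[i] = v;
+//   }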
+static inline int64_t GemmIsNotPackSSE(int64_t index, const float *a, const float *b, float *c, const float *bias, int row, + int deep, int act_type) { + SIMD_F32 down_threshold = SIMD_MOV_F32(0.0f); + SIMD_F32 up_threshold = SIMD_MOV_F32(6); + SIMD_F32 b_data16 = SIMD_MOV_F32(b[0]); + SIMD_F32 bias_data16 = SIMD_MOV_F32(bias[0]); + for (int block_max_size = row - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 a_data = SIMD_LD_F32(a + index); + SIMD_F32 dst = SIMD_FMADD_F32(b_data16, a_data, bias_data16); + if (act_type != 0) { + dst = SIMD_MAX_F32(dst, down_threshold); + if (act_type == 3) { + dst = SIMD_MIN_F32(dst, up_threshold); + } + } + SIMD_ST_F32(c + index, dst); + } + + return index; +} + +#if defined(MS_SIMD_AVX512) || defined(MS_SIMD_AVX) +static inline int64_t GemmIsNotPackOptimizeCoreSSE(int64_t index, const float *a, const float *b, int k, float *dst) { + SIMD_F32 dst1 = SIMD_MOV_F32(0.0f); + for (int block_max_size = k - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 weight = SIMD_LD_F32(b + index); + SIMD_F32 a1 = SIMD_LD_F32(a + index); + dst1 = SIMD_FMADD_F32(weight, a1, dst1); + } + *dst += SIMD_REDUCE_ADD_F32(dst1); + return index; +} +#endif + +static inline int64_t MatVecMulNoPackCoreSSE(int64_t oc_index, const float *a, const float *b, float *c, const float *bias, + int act_type, int64_t depth, int64_t oc, int64_t col, int64_t inc_flag) { + for (int64_t oc_max_size = oc - BLOCK_NUM; oc_index <= oc_max_size; oc_index += BLOCK_NUM) { + SIMD_F32 out = (inc_flag & 0x1) == 0 ? SIMD_LD_F32(c + oc_index) : (bias == NULL ? SIMD_MOV_F32(0.0f) : SIMD_LD_F32(bias + oc_index)); + for (int64_t k = 0; k < depth; ++k) { + SIMD_F32 left = SIMD_MOV_F32(a[k]); + SIMD_F32 right = SIMD_LD_F32(b + oc_index + k * col); + out = SIMD_FMADD_F32(left, right, out); + } + if ((inc_flag & 0x2) != 0 && act_type != 0) { + out = SIMD_MAX_F32(out, SIMD_MOV_F32(0.0f)); + if (act_type == 0x3) { + out = SIMD_MIN_F32(out, SIMD_MOV_F32(6.0f)); + } + } + SIMD_ST_F32(c + oc_index, out); + } + return oc_index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h new file mode 100644 index 00000000..e3dd4582 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/mul_fp32_sse.h @@ -0,0 +1,218 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_MUL_SSE_H_
+#define MINDSPORE_NNACL_FP32_MUL_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int ElementMulSSE(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluSSE(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulReluIntSSE(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementMulRelu6IntSSE(int index, const int *in0, const int *in1, int *out, int size) {
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1), 0), 6);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0_opt_, vin1);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
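+// Broadcast sketch (clarifying note): the OptNum0 variants splat in0[0]
+// across the vector (in0 is the scalar operand), while the OptNum1 variants
+// splat in1[0]; e.g. ElementOptMulNum1SSE computes out[i] = in0[i] * in1[0]
+// over the vectorized span and returns the index where scalar code resumes.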
+static inline int ElementOptMulNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_MUL_F32(vin0, vin1_opt_);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluNum0SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluNum1SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin0_opt_ = SIMD_MOV_F32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin1 = SIMD_LD_F32(in1 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0_opt_, vin1), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, int size) {
+  SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 vin0 = SIMD_LD_F32(in0 + index);
+    SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_MUL_F32(vin0, vin1_opt_), 0.0f), 6.0f);
+    SIMD_ST_F32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0_opt_, vin1);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vout = SIMD_MUL_EPI32(vin0, vin1_opt_);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulReluIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
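+// Clamp sketch: the integer relu/relu6 forms compute
+// vout = min(max(in0 * in1, 0), 6) lane-wise, with the scalar bounds splatted
+// by the *_N_EPI32 macros; e.g. lanes (-2, 1, 3, 9) * 1 become (0, 1, 3, 6).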
+static inline int ElementOptMulReluIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vout = SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulRelu6IntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin0_opt_ = SIMD_MOV_EPI32(in0[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index);
+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0_opt_, vin1), 0), 6);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+static inline int ElementOptMulRelu6IntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) {
+  SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]);
+  for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index);
+    SIMD_EPI32 vout = SIMD_MIN_N_EPI32(SIMD_MAX_N_EPI32(SIMD_MUL_EPI32(vin0, vin1_opt_), 0), 6);
+    SIMD_ST_EPI32(out + index, vout);
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
new file mode 100644
index 00000000..ad9239fd
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/pooling_fp32_sse.h
@@ -0,0 +1,84 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_FP32_POOLING_SSE_H_ +#define MINDSPORE_NNACL_FP32_POOLING_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int AvgPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_avg = SIMD_SET0_F32; + int real_count = 0; + for (int h = real_win_h_start; h < real_win_h_end; h++) { + for (int w = real_win_w_start; w < real_win_w_end; w++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + h) * in_w + in_w_index + w) * channel; + tmp_avg = SIMD_ADD_F32(tmp_avg, SIMD_LD_F32(src_win_ptr)); + ++real_count; + } + } + tmp_avg = SIMD_DIV_F32(tmp_avg, SIMD_MOV_F32(real_count)); + tmp_avg = SIMD_MAX_F32(tmp_avg, min_val); + tmp_avg = SIMD_MIN_F32(tmp_avg, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_avg); + } + return ci; +} + +static inline int MaxPoolingBatchSSE(int ci, const float *src_plane_ptr, int channel, + float *dst_plane_ptr, int real_win_h_start, int real_win_h_end, int real_win_w_start, int real_win_w_end, + int in_h_index, int in_w, int in_w_index, float minf, float maxf) { + SIMD_F32 min_val = SIMD_MOV_F32(minf); + SIMD_F32 max_val = SIMD_MOV_F32(maxf); + for (int block_max_size = channel - BLOCK_NUM + 1; ci < block_max_size; ci += BLOCK_NUM) { + const float *src_c_ptr = src_plane_ptr + ci; + float *dst_c_ptr = dst_plane_ptr + ci; + SIMD_F32 tmp_max = min_val; + for (int kh = real_win_h_start; kh < real_win_h_end; kh++) { + for (int kw = real_win_w_start; kw < real_win_w_end; kw++) { + const float *src_win_ptr = src_c_ptr + ((in_h_index + kh) * in_w + in_w_index + kw) * channel; + tmp_max = SIMD_MAX_F32(tmp_max, SIMD_LD_F32(src_win_ptr)); + } + } + tmp_max = SIMD_MIN_F32(tmp_max, max_val); + SIMD_ST_F32(dst_c_ptr, tmp_max); + } + return ci; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h new file mode 100644 index 00000000..4c46310e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/power_fp32_sse.h @@ -0,0 +1,101 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef MINDSPORE_NNACL_FP32_POWER_SSE_H_
+#define MINDSPORE_NNACL_FP32_POWER_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int PowerBroadCastIntExponentSSE(int index, const float *input, int exponent, float *output, int len,
+                                               float scale, float shift) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
+    SIMD_F32 result = SIMD_MOV_F32(1.0f);
+    int exp = abs(exponent);
+    while (exp) {
+      if (exp % 2) {
+        result = SIMD_MUL_F32(result, tmp);
+      }
+      tmp = SIMD_MUL_SQUARE_F32(tmp);
+      exp = exp / 2;
+    }
+    SIMD_ST_F32(output + index, exponent >= 0 ? result : SIMD_DIV_F32(SIMD_MOV_F32(1.0f), result));
+  }
+  return index;
+}
+
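+// Note: the loop above is exponentiation by squaring; for exponent 5 (binary
+// 101) it computes result = x^1 * x^4, so the multiply count grows with
+// log2(exponent) rather than the exponent itself, and negative exponents fall
+// back to 1 / result afterwards.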
+static inline int PowerBroadCastFloatExponentSSE(int index, const float *input, float exponent, float *output, int len,
+                                                 float scale, float shift) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 tmp = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
+    SIMD_F32 result;
+    for (int i = 0; i < BLOCK_NUM; ++i) {
+      SIMD_F32_GETI(result, i) = powf(SIMD_F32_GETI(tmp, i), exponent);
+    }
+    SIMD_ST_F32(output + index, result);
+  }
+  return index;
+}
+
+static inline int PowerSingleExponentSSE(int index, const float *input, const float *exponent, float *output, int len,
+                                         float scale, float shift) {
+  SIMD_F32 scale_vec = SIMD_MOV_F32(scale);
+  SIMD_F32 shift_vec = SIMD_MOV_F32(shift);
+  for (int block_max_size = len - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    SIMD_F32 tmp_vec = SIMD_FMADD_F32(scale_vec, SIMD_LD_F32(input + index), shift_vec);
+    for (int j = 0; j < BLOCK_NUM; ++j) {
+      float cur_exponent = exponent[index + j];
+      float cur_val = SIMD_F32_GETI(tmp_vec, j);
+      if (fabsf(cur_exponent - (int)(cur_exponent)) < 0.000001) {
+        int exp = abs((int)(cur_exponent));
+        float result = 1.0f;
+        while (exp) {
+          if (exp % 2) {
+            result *= cur_val;
+          }
+          cur_val *= cur_val;
+          exp = exp / 2;
+        }
+        output[index + j] = cur_exponent >= 0 ? result : 1.0f / result;
+      } else {
+        output[index + j] = powf(cur_val, cur_exponent);
+      }
+    }
+  }
+  return index;
+}
+
+#undef MS_SIMD_INSTRUCTION
+#undef BLOCK_NUM
+#pragma GCC pop_options
+#undef MS_SIMD_SSE
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
new file mode 100644
index 00000000..936a5d51
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/reduce_fp32_sse.h
@@ -0,0 +1,181 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_
+#define MINDSPORE_NNACL_FP32_REDUCE_FP32_SSE_H_
+
+#include "nnacl/intrinsics/ms_simd_instructions.h"
+#include "nnacl/intrinsics/ms_simd_sse_instructions.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION
+#define BLOCK_NUM 4
+#define MS_SIMD_SSE
+
+static inline int64_t ReduceSumSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                   int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(0);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceMeanSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                    int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(0);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_ADD_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, SIMD_DIV_N_F32(tmp, axis_size));
+  }
+  return index;
+}
+
+static inline int64_t ReduceMinSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                   int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(FLT_MAX);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_MIN_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceMaxSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                   int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(-FLT_MAX);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_MAX_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceProdSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                    int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const float *inner_src = outer_src + index;
+    SIMD_F32 tmp = SIMD_MOV_F32(1.0f);
+    for (int i = 0; i < axis_size; i++) {
+      tmp = SIMD_MUL_F32(tmp, SIMD_LD_F32(inner_src + i * inner_size));
+    }
+    SIMD_ST_F32(outer_dst + index, tmp);
+  }
+  return index;
+}
+
+static inline int64_t ReduceSumSquareSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size,
+                                         int axis_size) {
+  for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) {
+    const 
float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t ReduceL2NormSSE(int64_t index, const float *outer_src, float *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const float *inner_src = outer_src + index; + SIMD_F32 tmp = SIMD_MOV_F32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_F32(tmp, SIMD_MUL_SQUARE_F32(SIMD_LD_F32(inner_src + i * inner_size))); + } + SIMD_ST_F32(outer_dst + index, SIMD_SQRT_F32(tmp)); + } + return index; +} + +static inline int64_t IntReduceSumSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t IntReduceMeanSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(0); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_ADD_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, SIMD_DIV_N_EPI32(tmp, axis_size)); + } + return index; +} + +static inline int64_t IntReduceMinSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MAX); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MIN_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +static inline int64_t IntReduceMaxSSE(int64_t index, const int *outer_src, int *outer_dst, int inner_size, + int axis_size) { + for (int block_max_size = inner_size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + const int *inner_src = outer_src + index; + SIMD_EPI32 tmp = SIMD_MOV_EPI32(INT32_MIN); + for (int i = 0; i < axis_size; i++) { + tmp = SIMD_MAX_EPI32(tmp, SIMD_LD_EPI32(inner_src + i * inner_size)); + } + SIMD_ST_EPI32(outer_dst + index, tmp); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +} +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h new file mode 100644 index 00000000..71c89ebc --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/softmax_fp32_sse.h @@ -0,0 +1,87 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_ +#define MINDSPORE_NNACL_FP32_SOFTMAX_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int64_t SoftmaxNormGetMaxSSE(int64_t index, const float *src, int cur_batch_offset, + float *max, int channel) { + if (channel >= BLOCK_NUM * BLOCK_NUM) { + SIMD_F32 max_val = SIMD_MOV_F32(*max); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + max_val = SIMD_MAX_F32(max_val, SIMD_LD_F32(src + cur_batch_offset + index)); + } + *max = SIMD_GET_MAX_F32(max_val); + } + return index; +} + +static inline int64_t SoftmaxNormCalcNormSSE(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, int channel) { + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 output = SIMD_SUB_F32(SIMD_LD_F32(src + cur_batch_offset + index), SIMD_MOV_F32(max)); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} + +static inline int64_t SoftmaxLastAxisGetExpSumSSE(int64_t index, const float *src, float *dst, + int cur_batch_offset, float max, float *exp_sum, int channel) { +#ifndef _WIN32 + SIMD_F32 sum_val = SIMD_SET0_F32; + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_SUB_F32(input, SIMD_MOV_F32(max)); + SIMD_F32 exp_out = SIMD_EXP_F32(output); + sum_val = SIMD_ADD_F32(sum_val, exp_out); + SIMD_ST_F32(dst + cur_batch_offset + index, exp_out); + } + *exp_sum += SIMD_GET_SUM_F32(sum_val); +#endif + return index; +} + +static inline int64_t SoftmaxLastAxisGetResultSSE(int64_t index, const float *src, float *dst, + int cur_batch_offset, float exp_sum, int channel) { + SIMD_F32 exp_sum_val = SIMD_MOV_F32(exp_sum); + for (int block_max_size = channel - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 input = SIMD_LD_F32(src + cur_batch_offset + index); + SIMD_F32 output = SIMD_MUL_F32(input, exp_sum_val); + SIMD_ST_F32(dst + cur_batch_offset + index, output); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h new file mode 100644 index 00000000..a6197e19 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sse/sub_fp32_sse.h @@ -0,0 +1,167 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_NNACL_FP32_SUB_SSE_H_ +#define MINDSPORE_NNACL_FP32_SUB_SSE_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#include "nnacl/intrinsics/ms_simd_sse_instructions.h" + +#ifdef __cplusplus +extern "C" { +#endif +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define MS_SIMD_INSTRUCTION MS_SIMD_SSE_INSTRUCTION +#define BLOCK_NUM 4 +#define MS_SIMD_SSE + +static inline int ElementOptSubNum0SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0_opt, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubNum1SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1_opt_); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum0SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin0_opt = SIMD_MOV_EPI32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0_opt, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubIntNum1SSE(int index, const int *in0, const int *in1, int *out, int size) { + SIMD_EPI32 vin1_opt_ = SIMD_MOV_EPI32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1_opt_); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum0SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubReluNum1SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num0SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin0_opt = SIMD_MOV_F32(in0[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < 
block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0_opt, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementOptSubRelu6Num1SSE(int index, const float *in0, const float *in1, float *out, + int size) { + SIMD_F32 vin1_opt_ = SIMD_MOV_F32(in1[0]); + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1_opt_), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubSSE(int index, const float *in0, const float *in1, float *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_SUB_F32(vin0, vin1); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubIntSSE(int index, const int *in0, const int *in1, int *out, int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_EPI32 vin0 = SIMD_LD_EPI32(in0 + index); + SIMD_EPI32 vin1 = SIMD_LD_EPI32(in1 + index); + SIMD_EPI32 vout = SIMD_SUB_EPI32(vin0, vin1); + SIMD_ST_EPI32(out + index, vout); + } + return index; +} + +static inline int ElementSubReluSSE(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +static inline int ElementSubRelu6SSE(int index, const float *in0, const float *in1, float *out, + int size) { + for (int block_max_size = size - BLOCK_NUM + 1; index < block_max_size; index += BLOCK_NUM) { + SIMD_F32 vin0 = SIMD_LD_F32(in0 + index); + SIMD_F32 vin1 = SIMD_LD_F32(in1 + index); + SIMD_F32 vout = SIMD_MIN_N_F32(SIMD_MAX_N_F32(SIMD_SUB_F32(vin0, vin1), 0.0f), 6.0f); + SIMD_ST_F32(out + index, vout); + } + return index; +} + +#undef MS_SIMD_INSTRUCTION +#undef BLOCK_NUM +#pragma GCC pop_options +#undef MS_SIMD_SSE +#ifdef __cplusplus +}; +#endif +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h new file mode 100644 index 00000000..894f5d7c --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/include/nnacl/sub_fp32_simd.h @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_NNACL_SUB_FP32_SIMD_H_ +#define MINDSPORE_NNACL_SUB_FP32_SIMD_H_ + +#include "nnacl/intrinsics/ms_simd_instructions.h" +#ifdef ENABLE_AVX512 +#include "nnacl/avx512/sub_fp32_avx512.h" +#endif + +#ifdef ENABLE_AVX +#include "nnacl/avx/sub_fp32_avx.h" +#endif + +#ifdef ENABLE_SSE +#include "nnacl/sse/sub_fp32_sse.h" +#endif + +#ifdef ENABLE_ARM +#include "nnacl/neon/sub_fp32_neon.h" +#endif + +#endif -- 2.34.1