/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/ |
D | winograd_utils.c | 58 MS_FLOAT32X4 src[16]; in InputTransform4x4Unit() 59 MS_FLOAT32X4 t[16]; in InputTransform4x4Unit() 60 MS_FLOAT32X4 m[16]; in InputTransform4x4Unit() 114 MS_FLOAT32X4 src[36]; in InputTransform6x6Unit() 115 MS_FLOAT32X4 t[36]; in InputTransform6x6Unit() 116 MS_FLOAT32X4 m[36]; in InputTransform6x6Unit() 120 MS_FLOAT32X4 tmp1 = MS_SUBQ_F32(src[3 + offset], src[1 + offset]); in InputTransform6x6Unit() 121 MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(src[4 + offset], src[2 + offset]); in InputTransform6x6Unit() 135 MS_FLOAT32X4 tmp1 = MS_SUBQ_F32(t[3 + offset], t[1 + offset]); in InputTransform6x6Unit() 136 MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(t[4 + offset], t[2 + offset]); in InputTransform6x6Unit() [all …]
|
D | scale_fp32.c | 39 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale[i]); in ScaleInner() 40 MS_FLOAT32X4 offset_4 = MS_MOVQ_F32(offset[i]); in ScaleInner() 43 MS_FLOAT32X4 data = MS_LDQ_F32(in_data + in_offset); in ScaleInner() 44 MS_FLOAT32X4 result = MS_MLAQ_F32(offset_4, data, scale_4); in ScaleInner() 73 MS_FLOAT32X4 scale_4 = MS_LDQ_F32(scale + index); in ScaleAxis() 74 MS_FLOAT32X4 offset_4 = MS_LDQ_F32(offset + index); in ScaleAxis() 76 MS_FLOAT32X4 data = MS_LDQ_F32(in_data + in_offset); in ScaleAxis() 77 MS_FLOAT32X4 result = MS_MLAQ_F32(offset_4, data, scale_4); in ScaleAxis() 111 MS_FLOAT32X4 zeros = {0, 0, 0, 0}; in ScaleInnerRelu() 130 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale[i]); in ScaleInnerRelu() [all …]
|
D | add_fp32.c | 26 MS_FLOAT32X4 vin0_opt = MS_MOVQ_F32(in0[0]); in ElementOptAdd() 27 MS_FLOAT32X4 vin1_opt = MS_MOVQ_F32(in1[0]); in ElementOptAdd() 40 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementOptAdd() 41 MS_FLOAT32X4 vout = MS_ADDQ_F32(vin0_opt, vin1); in ElementOptAdd() 58 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementOptAdd() 59 MS_FLOAT32X4 vout = MS_ADDQ_F32(vin0, vin1_opt); in ElementOptAdd() 127 MS_FLOAT32X4 vin0_opt = MS_MOVQ_F32(in0[0]); in ElementOptAddRelu() 128 MS_FLOAT32X4 vin1_opt = MS_MOVQ_F32(in1[0]); in ElementOptAddRelu() 129 MS_FLOAT32X4 zeros = MS_MOVQ_F32(0.0f); in ElementOptAddRelu() 142 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementOptAddRelu() [all …]
|
D | instance_norm_fp32.c | 56 MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index); in InstanceNorm() 57 MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv); in InstanceNorm() 98 MS_FLOAT32X4 meanv4 = MS_MOVQ_F32(mean); in InstanceNorm() 99 MS_FLOAT32X4 denov4 = MS_MOVQ_F32(deno); in InstanceNorm() 101 MS_FLOAT32X4 srcv4 = MS_LDQ_F32(src + index); in InstanceNorm() 102 MS_FLOAT32X4 dstv4 = in InstanceNorm() 118 MS_FLOAT32X4 hw_planev) { in InstanceNormC4HW4ArmSse() 124 MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f), mean1 = MS_MOVQ_F32(0.0f); in InstanceNormC4HW4ArmSse() 125 MS_FLOAT32X4 mean2 = MS_MOVQ_F32(0.0f), mean3 = MS_MOVQ_F32(0.0f); in InstanceNormC4HW4ArmSse() 126 MS_FLOAT32X4 squ_m = MS_MOVQ_F32(0.0f), squ_m1 = MS_MOVQ_F32(0.0f); in InstanceNormC4HW4ArmSse() [all …]
|
D | activation_fp32.c | 31 MS_FLOAT32X4 zero = MS_MOVQ_F32(0.0f); in Fp32Relu() 56 MS_FLOAT32X4 zero = MS_MOVQ_F32(0.0f); in Fp32Relu6() 57 MS_FLOAT32X4 six = MS_MOVQ_F32(6.0f); in Fp32Relu6() 59 MS_FLOAT32X4 dst_tmp = MS_MAXQ_F32(MS_LDQ_F32(src + i), zero); in Fp32Relu6() 87 MS_FLOAT32X4 src_tmp = MS_LDQ_F32(src + i); in LRelu() 88 MS_FLOAT32X4 mul_tmp = MS_MULQ_N_F32(src_tmp, alpha); in LRelu() 92 MS_FLOAT32X4 mask = MS_CMPGTQ_F32(src_tmp, MS_MOVQ_F32(0.0f)); in LRelu() 150 MS_FLOAT32X4 input = MS_LDQ_F32(src + i); in Tanh() 177 MS_FLOAT32X4 src_value = MS_LDQ_F32(src + index); in Swish() 178 MS_FLOAT32X4 sigmoid_value = MS_LDQ_F32(dst + index); in Swish() [all …]
|
D | mul_fp32.c | 37 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementMul() 38 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementMul() 39 MS_FLOAT32X4 vout = MS_MULQ_F32(vin0, vin1); in ElementMul() 62 MS_FLOAT32X4 zeros = MS_MOVQ_F32(0.0f); in ElementMulRelu() 64 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementMulRelu() 65 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementMulRelu() 66 MS_FLOAT32X4 vout = MS_MULQ_F32(vin0, vin1); in ElementMulRelu() 91 MS_FLOAT32X4 zeros = MS_MOVQ_F32(0.0f); in ElementMulRelu6() 92 MS_FLOAT32X4 bounds = MS_MOVQ_F32(6.0f); in ElementMulRelu6() 94 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementMulRelu6() [all …]
|
D | exp_fp32.h | 34 static inline MS_FLOAT32X4 VexpFp32(MS_FLOAT32X4 input) { in VexpFp32() 35 static MS_FLOAT32X4 param[] = {{0.693147f, 0.693147f, 0.693147f, 0.693147f}, in VexpFp32() 42 MS_FLOAT32X4 decimal = MS_SUBQ_F32(input, MS_MULQ_F32(MS_CVTQEPI32_PS(integer), param[0])); in VexpFp32() 44 MS_FLOAT32X4 tmp = MS_MULQ_F32(decimal, (MS_ADDQ_F32(param[2], MS_MULQ_F32(decimal, param[1])))); in VexpFp32() 46 …MS_FLOAT32X4 decimal_exp = MS_ADDQ_F32(param[5], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[5], tmp))); in VexpFp32() 50 static inline void simd_exp(MS_FLOAT32X4 input, float *dst) { in simd_exp() 51 static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f}; in simd_exp() 52 static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f}; in simd_exp()
|
D | power_fp32.c | 21 MS_FLOAT32X4 OptimizedPowerSimd(MS_FLOAT32X4 x, const float *exponent) { in OptimizedPowerSimd() 23 MS_FLOAT32X4 result = MS_MOVQ_F32(1.0f); in OptimizedPowerSimd() 71 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale); in PowerBroadCast() 72 MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift); in PowerBroadCast() 74 …MS_FLOAT32X4 result = PowerSimdFun_(MS_ADDQ_F32(MS_MULQ_F32(scale_4, MS_LDQ_F32(input + i)), shift… in PowerBroadCast() 88 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale); in PowerSingle() 89 MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift); in PowerSingle() 91 MS_FLOAT32X4 tmp_4 = MS_ADDQ_F32(MS_MULQ_F32(scale_4, MS_LDQ_F32(input + i)), shift_4); in PowerSingle()
|
D | resize_fp32.c | 178 MS_FLOAT32X4 left_w = MS_MOVQ_F32(x_left_weights[w]); in InterpRow() 179 MS_FLOAT32X4 right_w = MS_MOVQ_F32(1.0f - x_left_weights[w]); in InterpRow() 181 MS_FLOAT32X4 left = MS_LDQ_F32(src_line + x_lefts[w] * in_c + c); in InterpRow() 182 MS_FLOAT32X4 right = MS_LDQ_F32(src_line + x_rights[w] * in_c + c); in InterpRow() 183 … MS_FLOAT32X4 interp_value = MS_ADDQ_F32(MS_MULQ_F32(left, left_w), MS_MULQ_F32(right, right_w)); in InterpRow() 214 MS_FLOAT32X4 bottom_w = MS_MOVQ_F32(y_bottom_weight); in InterpCol() 215 MS_FLOAT32X4 top_w = MS_MOVQ_F32(1.0f - y_bottom_weight); in InterpCol() 217 MS_FLOAT32X4 bottom = MS_LDQ_F32(bottom_line + w * in_c + c); in InterpCol() 218 MS_FLOAT32X4 top = MS_LDQ_F32(top_line + w * in_c + c); in InterpCol() 219 … MS_FLOAT32X4 interp_value = MS_ADDQ_F32(MS_MULQ_F32(bottom, bottom_w), MS_MULQ_F32(top, top_w)); in InterpCol() [all …]
|
D | prelu_fp32.c | 118 MS_FLOAT32X4 in = MS_LDQ_F32(cur_in + j); in PRelu() 119 MS_FLOAT32X4 s = MS_LDQ_F32(slope + j); in PRelu() 120 MS_FLOAT32X4 mul = MS_MULQ_F32(in, s); in PRelu() 121 MS_FLOAT32X4 zero = MS_MOVQ_F32(0.0f); in PRelu() 122 MS_FLOAT32X4 res = MS_BLENDQ_F32(mul, in, MS_CMPGTQ_F32(in, zero)); in PRelu() 150 MS_FLOAT32X4 src_tmp = MS_LDQ_F32(input + i); in PReluShareChannel() 151 MS_FLOAT32X4 mul_tmp = MS_MULQ_N_F32(src_tmp, slope); in PReluShareChannel() 155 MS_FLOAT32X4 mask = MS_CMPGTQ_F32(src_tmp, MS_MOVQ_F32(0.0f)); in PReluShareChannel()
|
D | power_fp32.h | 26 typedef MS_FLOAT32X4 (*PowerSimdFun)(MS_FLOAT32X4 x, const float *exponent); 39 static inline MS_FLOAT32X4 StdPowerSimd(MS_FLOAT32X4 x, const float *exponent) { in StdPowerSimd() 40 MS_FLOAT32X4 result; in StdPowerSimd()
|
D | cumsum_fp32.c | 32 MS_FLOAT32X4 val = MS_LDQ_F32(layer_input + j); in Cumsum() 46 MS_FLOAT32X4 zero_val = MS_MOVQ_F32(0.0f); in Cumsum() 65 MS_FLOAT32X4 input_val = MS_LDQ_F32(layer_input + k); in Cumsum() 66 MS_FLOAT32X4 last_output_val = MS_LDQ_F32(layer_last_output + k); in Cumsum() 67 MS_FLOAT32X4 out_val = MS_ADDQ_F32(input_val, last_output_val); in Cumsum() 92 MS_FLOAT32X4 val = MS_LDQ_F32(layer_input + j); in CumsumReverse() 106 MS_FLOAT32X4 zero_val = MS_MOVQ_F32(0.0f); in CumsumReverse() 125 MS_FLOAT32X4 input_val = MS_LDQ_F32(layer_input - k - 3); in CumsumReverse() 126 MS_FLOAT32X4 last_output_val = MS_LDQ_F32(layer_last_output - k - 3); in CumsumReverse() 127 MS_FLOAT32X4 out_val = MS_ADDQ_F32(input_val, last_output_val); in CumsumReverse()
|
D | conv_depthwise_fp32.c | 373 MS_FLOAT32X4 v0, v1, v2, v3; in ConvDw3x3RowLeft() 380 MS_FLOAT32X4 b0 = MS_SUBQ_F32(v0, v2); in ConvDw3x3RowLeft() 381 MS_FLOAT32X4 b1 = MS_ADDQ_F32(v1, v2); in ConvDw3x3RowLeft() 382 MS_FLOAT32X4 b2 = MS_SUBQ_F32(v2, v1); in ConvDw3x3RowLeft() 383 MS_FLOAT32X4 b3 = MS_SUBQ_F32(v3, v1); in ConvDw3x3RowLeft() 405 MS_FLOAT32X4 v0, v1, v2, v3; in ConvDw3x3RowMiddle() 412 MS_FLOAT32X4 b0 = MS_SUBQ_F32(v0, v2); in ConvDw3x3RowMiddle() 413 MS_FLOAT32X4 b1 = MS_ADDQ_F32(v1, v2); in ConvDw3x3RowMiddle() 414 MS_FLOAT32X4 b2 = MS_SUBQ_F32(v2, v1); in ConvDw3x3RowMiddle() 415 MS_FLOAT32X4 b3 = MS_SUBQ_F32(v3, v1); in ConvDw3x3RowMiddle() [all …]
|
D | pooling_fp32.c | 41 MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); in AvgPooling() 42 MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); in AvgPooling() 92 MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0); in AvgPooling() 156 MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); in MaxPooling() 157 MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); in MaxPooling() 201 MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX); in MaxPooling()
|
D | exp_fp32.c | 53 MS_FLOAT32X4 scale = MS_MOVQ_F32(param->in_scale_); in ExpFusionFp32() 66 … MS_FLOAT32X4 scale = {param->out_scale_, param->out_scale_, param->out_scale_, param->out_scale_}; in ExpFusionFp32()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ |
D | ms_simd_instructions.h | 46 #define MS_FLOAT32X4 float32x4_t macro 124 #define MS_FLOAT32X4 __m128 macro 157 static inline MS_FLOAT32X4 MS_SQRTFX4_F32(MS_FLOAT32X4 src) { in MS_SQRTFX4_F32() 158 MS_FLOAT32X4 dst; in MS_SQRTFX4_F32() 167 MS_FLOAT32X4 src##1 = MS_LDQ_F32(input_ptr + 0 * num); \ 168 MS_FLOAT32X4 src##2 = MS_LDQ_F32(input_ptr + 1 * num); \ 169 MS_FLOAT32X4 src##3 = MS_LDQ_F32(input_ptr + 2 * num); \ 170 MS_FLOAT32X4 src##4 = MS_LDQ_F32(input_ptr + 3 * num); \ 171 MS_FLOAT32X4 src##5 = MS_LDQ_F32(input_ptr + 4 * num); \ 172 MS_FLOAT32X4 src##6 = MS_LDQ_F32(input_ptr + 5 * num); \ [all …]
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/ |
D | minimal_filtering_generator.h | 56 void MatrixMultiplyVec(const MS_FLOAT32X4 *matrix_a, const MS_FLOAT32X4 *matrix_b, MS_FLOAT32X4 *ma…
|
D | minimal_filtering_generator.c | 234 void MatrixMultiplyVec(const MS_FLOAT32X4 *matrix_a, const MS_FLOAT32X4 *matrix_b, MS_FLOAT32X4 *ma… in MatrixMultiplyVec() 237 MS_FLOAT32X4 bias_ptr = MS_MOVQ_F32(0); in MatrixMultiplyVec() 244 MS_FLOAT32X4 res = MS_MOVQ_F32(0); in MatrixMultiplyVec()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/ |
D | softmax.c | 27 MS_FLOAT32X4 input = vld1q_f32(src + i); in ExpFp32Offset() 28 MS_FLOAT32X4 bias = vdupq_n_f32(sub_bias); in ExpFp32Offset() 29 MS_FLOAT32X4 i1 = vsubq_f32(input, bias); in ExpFp32Offset()
|