Home
last modified time | relevance | path

Searched refs:MS_FLOAT32X4 (Results 1 – 19 of 19) sorted by relevance

/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/
Dwinograd_utils.c58 MS_FLOAT32X4 src[16]; in InputTransform4x4Unit()
59 MS_FLOAT32X4 t[16]; in InputTransform4x4Unit()
60 MS_FLOAT32X4 m[16]; in InputTransform4x4Unit()
114 MS_FLOAT32X4 src[36]; in InputTransform6x6Unit()
115 MS_FLOAT32X4 t[36]; in InputTransform6x6Unit()
116 MS_FLOAT32X4 m[36]; in InputTransform6x6Unit()
120 MS_FLOAT32X4 tmp1 = MS_SUBQ_F32(src[3 + offset], src[1 + offset]); in InputTransform6x6Unit()
121 MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(src[4 + offset], src[2 + offset]); in InputTransform6x6Unit()
135 MS_FLOAT32X4 tmp1 = MS_SUBQ_F32(t[3 + offset], t[1 + offset]); in InputTransform6x6Unit()
136 MS_FLOAT32X4 tmp2 = MS_SUBQ_F32(t[4 + offset], t[2 + offset]); in InputTransform6x6Unit()
[all …]
Dscale_fp32.c39 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale[i]); in ScaleInner()
40 MS_FLOAT32X4 offset_4 = MS_MOVQ_F32(offset[i]); in ScaleInner()
43 MS_FLOAT32X4 data = MS_LDQ_F32(in_data + in_offset); in ScaleInner()
44 MS_FLOAT32X4 result = MS_MLAQ_F32(offset_4, data, scale_4); in ScaleInner()
73 MS_FLOAT32X4 scale_4 = MS_LDQ_F32(scale + index); in ScaleAxis()
74 MS_FLOAT32X4 offset_4 = MS_LDQ_F32(offset + index); in ScaleAxis()
76 MS_FLOAT32X4 data = MS_LDQ_F32(in_data + in_offset); in ScaleAxis()
77 MS_FLOAT32X4 result = MS_MLAQ_F32(offset_4, data, scale_4); in ScaleAxis()
111 MS_FLOAT32X4 zeros = {0, 0, 0, 0}; in ScaleInnerRelu()
130 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale[i]); in ScaleInnerRelu()
[all …]
Dadd_fp32.c26 MS_FLOAT32X4 vin0_opt = MS_MOVQ_F32(in0[0]); in ElementOptAdd()
27 MS_FLOAT32X4 vin1_opt = MS_MOVQ_F32(in1[0]); in ElementOptAdd()
40 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementOptAdd()
41 MS_FLOAT32X4 vout = MS_ADDQ_F32(vin0_opt, vin1); in ElementOptAdd()
58 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementOptAdd()
59 MS_FLOAT32X4 vout = MS_ADDQ_F32(vin0, vin1_opt); in ElementOptAdd()
127 MS_FLOAT32X4 vin0_opt = MS_MOVQ_F32(in0[0]); in ElementOptAddRelu()
128 MS_FLOAT32X4 vin1_opt = MS_MOVQ_F32(in1[0]); in ElementOptAddRelu()
129 MS_FLOAT32X4 zeros = MS_MOVQ_F32(0.0f); in ElementOptAddRelu()
142 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementOptAddRelu()
[all …]
Dinstance_norm_fp32.c56 MS_FLOAT32X4 srcv = MS_LDQ_F32(src + index); in InstanceNorm()
57 MS_FLOAT32X4 squarev = MS_MULQ_F32(srcv, srcv); in InstanceNorm()
98 MS_FLOAT32X4 meanv4 = MS_MOVQ_F32(mean); in InstanceNorm()
99 MS_FLOAT32X4 denov4 = MS_MOVQ_F32(deno); in InstanceNorm()
101 MS_FLOAT32X4 srcv4 = MS_LDQ_F32(src + index); in InstanceNorm()
102 MS_FLOAT32X4 dstv4 = in InstanceNorm()
118 MS_FLOAT32X4 hw_planev) { in InstanceNormC4HW4ArmSse()
124 MS_FLOAT32X4 mean = MS_MOVQ_F32(0.0f), mean1 = MS_MOVQ_F32(0.0f); in InstanceNormC4HW4ArmSse()
125 MS_FLOAT32X4 mean2 = MS_MOVQ_F32(0.0f), mean3 = MS_MOVQ_F32(0.0f); in InstanceNormC4HW4ArmSse()
126 MS_FLOAT32X4 squ_m = MS_MOVQ_F32(0.0f), squ_m1 = MS_MOVQ_F32(0.0f); in InstanceNormC4HW4ArmSse()
[all …]
Dactivation_fp32.c31 MS_FLOAT32X4 zero = MS_MOVQ_F32(0.0f); in Fp32Relu()
56 MS_FLOAT32X4 zero = MS_MOVQ_F32(0.0f); in Fp32Relu6()
57 MS_FLOAT32X4 six = MS_MOVQ_F32(6.0f); in Fp32Relu6()
59 MS_FLOAT32X4 dst_tmp = MS_MAXQ_F32(MS_LDQ_F32(src + i), zero); in Fp32Relu6()
87 MS_FLOAT32X4 src_tmp = MS_LDQ_F32(src + i); in LRelu()
88 MS_FLOAT32X4 mul_tmp = MS_MULQ_N_F32(src_tmp, alpha); in LRelu()
92 MS_FLOAT32X4 mask = MS_CMPGTQ_F32(src_tmp, MS_MOVQ_F32(0.0f)); in LRelu()
150 MS_FLOAT32X4 input = MS_LDQ_F32(src + i); in Tanh()
177 MS_FLOAT32X4 src_value = MS_LDQ_F32(src + index); in Swish()
178 MS_FLOAT32X4 sigmoid_value = MS_LDQ_F32(dst + index); in Swish()
[all …]
Dmul_fp32.c37 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementMul()
38 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementMul()
39 MS_FLOAT32X4 vout = MS_MULQ_F32(vin0, vin1); in ElementMul()
62 MS_FLOAT32X4 zeros = MS_MOVQ_F32(0.0f); in ElementMulRelu()
64 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementMulRelu()
65 MS_FLOAT32X4 vin1 = MS_LDQ_F32(in1 + index); in ElementMulRelu()
66 MS_FLOAT32X4 vout = MS_MULQ_F32(vin0, vin1); in ElementMulRelu()
91 MS_FLOAT32X4 zeros = MS_MOVQ_F32(0.0f); in ElementMulRelu6()
92 MS_FLOAT32X4 bounds = MS_MOVQ_F32(6.0f); in ElementMulRelu6()
94 MS_FLOAT32X4 vin0 = MS_LDQ_F32(in0 + index); in ElementMulRelu6()
[all …]
Dexp_fp32.h34 static inline MS_FLOAT32X4 VexpFp32(MS_FLOAT32X4 input) { in VexpFp32()
35 static MS_FLOAT32X4 param[] = {{0.693147f, 0.693147f, 0.693147f, 0.693147f}, in VexpFp32()
42 MS_FLOAT32X4 decimal = MS_SUBQ_F32(input, MS_MULQ_F32(MS_CVTQEPI32_PS(integer), param[0])); in VexpFp32()
44 MS_FLOAT32X4 tmp = MS_MULQ_F32(decimal, (MS_ADDQ_F32(param[2], MS_MULQ_F32(decimal, param[1])))); in VexpFp32()
46MS_FLOAT32X4 decimal_exp = MS_ADDQ_F32(param[5], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[5], tmp))); in VexpFp32()
50 static inline void simd_exp(MS_FLOAT32X4 input, float *dst) { in simd_exp()
51 static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f}; in simd_exp()
52 static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f}; in simd_exp()
Dpower_fp32.c21 MS_FLOAT32X4 OptimizedPowerSimd(MS_FLOAT32X4 x, const float *exponent) { in OptimizedPowerSimd()
23 MS_FLOAT32X4 result = MS_MOVQ_F32(1.0f); in OptimizedPowerSimd()
71 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale); in PowerBroadCast()
72 MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift); in PowerBroadCast()
74MS_FLOAT32X4 result = PowerSimdFun_(MS_ADDQ_F32(MS_MULQ_F32(scale_4, MS_LDQ_F32(input + i)), shift… in PowerBroadCast()
88 MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale); in PowerSingle()
89 MS_FLOAT32X4 shift_4 = MS_MOVQ_F32(shift); in PowerSingle()
91 MS_FLOAT32X4 tmp_4 = MS_ADDQ_F32(MS_MULQ_F32(scale_4, MS_LDQ_F32(input + i)), shift_4); in PowerSingle()
Dresize_fp32.c178 MS_FLOAT32X4 left_w = MS_MOVQ_F32(x_left_weights[w]); in InterpRow()
179 MS_FLOAT32X4 right_w = MS_MOVQ_F32(1.0f - x_left_weights[w]); in InterpRow()
181 MS_FLOAT32X4 left = MS_LDQ_F32(src_line + x_lefts[w] * in_c + c); in InterpRow()
182 MS_FLOAT32X4 right = MS_LDQ_F32(src_line + x_rights[w] * in_c + c); in InterpRow()
183MS_FLOAT32X4 interp_value = MS_ADDQ_F32(MS_MULQ_F32(left, left_w), MS_MULQ_F32(right, right_w)); in InterpRow()
214 MS_FLOAT32X4 bottom_w = MS_MOVQ_F32(y_bottom_weight); in InterpCol()
215 MS_FLOAT32X4 top_w = MS_MOVQ_F32(1.0f - y_bottom_weight); in InterpCol()
217 MS_FLOAT32X4 bottom = MS_LDQ_F32(bottom_line + w * in_c + c); in InterpCol()
218 MS_FLOAT32X4 top = MS_LDQ_F32(top_line + w * in_c + c); in InterpCol()
219MS_FLOAT32X4 interp_value = MS_ADDQ_F32(MS_MULQ_F32(bottom, bottom_w), MS_MULQ_F32(top, top_w)); in InterpCol()
[all …]
Dprelu_fp32.c118 MS_FLOAT32X4 in = MS_LDQ_F32(cur_in + j); in PRelu()
119 MS_FLOAT32X4 s = MS_LDQ_F32(slope + j); in PRelu()
120 MS_FLOAT32X4 mul = MS_MULQ_F32(in, s); in PRelu()
121 MS_FLOAT32X4 zero = MS_MOVQ_F32(0.0f); in PRelu()
122 MS_FLOAT32X4 res = MS_BLENDQ_F32(mul, in, MS_CMPGTQ_F32(in, zero)); in PRelu()
150 MS_FLOAT32X4 src_tmp = MS_LDQ_F32(input + i); in PReluShareChannel()
151 MS_FLOAT32X4 mul_tmp = MS_MULQ_N_F32(src_tmp, slope); in PReluShareChannel()
155 MS_FLOAT32X4 mask = MS_CMPGTQ_F32(src_tmp, MS_MOVQ_F32(0.0f)); in PReluShareChannel()
Dpower_fp32.h26 typedef MS_FLOAT32X4 (*PowerSimdFun)(MS_FLOAT32X4 x, const float *exponent);
39 static inline MS_FLOAT32X4 StdPowerSimd(MS_FLOAT32X4 x, const float *exponent) { in StdPowerSimd()
40 MS_FLOAT32X4 result; in StdPowerSimd()
Dcumsum_fp32.c32 MS_FLOAT32X4 val = MS_LDQ_F32(layer_input + j); in Cumsum()
46 MS_FLOAT32X4 zero_val = MS_MOVQ_F32(0.0f); in Cumsum()
65 MS_FLOAT32X4 input_val = MS_LDQ_F32(layer_input + k); in Cumsum()
66 MS_FLOAT32X4 last_output_val = MS_LDQ_F32(layer_last_output + k); in Cumsum()
67 MS_FLOAT32X4 out_val = MS_ADDQ_F32(input_val, last_output_val); in Cumsum()
92 MS_FLOAT32X4 val = MS_LDQ_F32(layer_input + j); in CumsumReverse()
106 MS_FLOAT32X4 zero_val = MS_MOVQ_F32(0.0f); in CumsumReverse()
125 MS_FLOAT32X4 input_val = MS_LDQ_F32(layer_input - k - 3); in CumsumReverse()
126 MS_FLOAT32X4 last_output_val = MS_LDQ_F32(layer_last_output - k - 3); in CumsumReverse()
127 MS_FLOAT32X4 out_val = MS_ADDQ_F32(input_val, last_output_val); in CumsumReverse()
Dconv_depthwise_fp32.c373 MS_FLOAT32X4 v0, v1, v2, v3; in ConvDw3x3RowLeft()
380 MS_FLOAT32X4 b0 = MS_SUBQ_F32(v0, v2); in ConvDw3x3RowLeft()
381 MS_FLOAT32X4 b1 = MS_ADDQ_F32(v1, v2); in ConvDw3x3RowLeft()
382 MS_FLOAT32X4 b2 = MS_SUBQ_F32(v2, v1); in ConvDw3x3RowLeft()
383 MS_FLOAT32X4 b3 = MS_SUBQ_F32(v3, v1); in ConvDw3x3RowLeft()
405 MS_FLOAT32X4 v0, v1, v2, v3; in ConvDw3x3RowMiddle()
412 MS_FLOAT32X4 b0 = MS_SUBQ_F32(v0, v2); in ConvDw3x3RowMiddle()
413 MS_FLOAT32X4 b1 = MS_ADDQ_F32(v1, v2); in ConvDw3x3RowMiddle()
414 MS_FLOAT32X4 b2 = MS_SUBQ_F32(v2, v1); in ConvDw3x3RowMiddle()
415 MS_FLOAT32X4 b3 = MS_SUBQ_F32(v3, v1); in ConvDw3x3RowMiddle()
[all …]
Dpooling_fp32.c41 MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); in AvgPooling()
42 MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); in AvgPooling()
92 MS_FLOAT32X4 tmp_avg = MS_MOVQ_F32(0); in AvgPooling()
156 MS_FLOAT32X4 min_value = MS_MOVQ_F32(minf); in MaxPooling()
157 MS_FLOAT32X4 max_value = MS_MOVQ_F32(maxf); in MaxPooling()
201 MS_FLOAT32X4 tmp_max = MS_MOVQ_F32(-FLT_MAX); in MaxPooling()
Dexp_fp32.c53 MS_FLOAT32X4 scale = MS_MOVQ_F32(param->in_scale_); in ExpFusionFp32()
66MS_FLOAT32X4 scale = {param->out_scale_, param->out_scale_, param->out_scale_, param->out_scale_}; in ExpFusionFp32()
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/
Dms_simd_instructions.h46 #define MS_FLOAT32X4 float32x4_t macro
124 #define MS_FLOAT32X4 __m128 macro
157 static inline MS_FLOAT32X4 MS_SQRTFX4_F32(MS_FLOAT32X4 src) { in MS_SQRTFX4_F32()
158 MS_FLOAT32X4 dst; in MS_SQRTFX4_F32()
167 MS_FLOAT32X4 src##1 = MS_LDQ_F32(input_ptr + 0 * num); \
168 MS_FLOAT32X4 src##2 = MS_LDQ_F32(input_ptr + 1 * num); \
169 MS_FLOAT32X4 src##3 = MS_LDQ_F32(input_ptr + 2 * num); \
170 MS_FLOAT32X4 src##4 = MS_LDQ_F32(input_ptr + 3 * num); \
171 MS_FLOAT32X4 src##5 = MS_LDQ_F32(input_ptr + 4 * num); \
172 MS_FLOAT32X4 src##6 = MS_LDQ_F32(input_ptr + 5 * num); \
[all …]
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/
Dminimal_filtering_generator.h56 void MatrixMultiplyVec(const MS_FLOAT32X4 *matrix_a, const MS_FLOAT32X4 *matrix_b, MS_FLOAT32X4 *ma…
Dminimal_filtering_generator.c234 void MatrixMultiplyVec(const MS_FLOAT32X4 *matrix_a, const MS_FLOAT32X4 *matrix_b, MS_FLOAT32X4 *ma… in MatrixMultiplyVec()
237 MS_FLOAT32X4 bias_ptr = MS_MOVQ_F32(0); in MatrixMultiplyVec()
244 MS_FLOAT32X4 res = MS_MOVQ_F32(0); in MatrixMultiplyVec()
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/
Dsoftmax.c27 MS_FLOAT32X4 input = vld1q_f32(src + i); in ExpFp32Offset()
28 MS_FLOAT32X4 bias = vdupq_n_f32(sub_bias); in ExpFp32Offset()
29 MS_FLOAT32X4 i1 = vsubq_f32(input, bias); in ExpFp32Offset()