/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/ |
D | winograd_avx.c | 22 MS_FLOAT32X8 src[16]; in OutputTransform4x2AvxUnit() 23 MS_FLOAT32X8 t[8]; in OutputTransform4x2AvxUnit() 24 MS_FLOAT32X8 m[4]; in OutputTransform4x2AvxUnit() 26 MS_FLOAT32X8 bias_ptr = MS_LD256_F32(bias_data); in OutputTransform4x2AvxUnit() 54 MS_FLOAT32X8 src[16]; in OutputTransform4x2ReluAvxUnit() 55 MS_FLOAT32X8 t[8]; in OutputTransform4x2ReluAvxUnit() 56 MS_FLOAT32X8 m[4]; in OutputTransform4x2ReluAvxUnit() 57 MS_FLOAT32X8 zero = MS_MOV256_F32(0); in OutputTransform4x2ReluAvxUnit() 59 MS_FLOAT32X8 bias_ptr = MS_LD256_F32(bias_data); in OutputTransform4x2ReluAvxUnit() 89 MS_FLOAT32X8 src[16]; in OutputTransform4x2Relu6AvxUnit() [all …]
|
D | scale_fp32.c | 29 MS_FLOAT32X8 scale_8 = MS_MOV256_F32(scale[i]); in ScaleInner() 30 MS_FLOAT32X8 offset_8 = MS_MOV256_F32(offset[i]); in ScaleInner() 33 MS_FLOAT32X8 data = MS_LD256_F32(in_data + in_offset); in ScaleInner() 34 MS_FLOAT32X8 result = MS_MLA256_F32(offset_8, data, scale_8); in ScaleInner() 64 MS_FLOAT32X8 scale_8 = MS_LD256_F32(scale + index); in ScaleAxis() 65 MS_FLOAT32X8 offset_8 = MS_LD256_F32(offset + index); in ScaleAxis() 66 MS_FLOAT32X8 data = MS_LD256_F32(in_data + in_offset); in ScaleAxis() 67 MS_FLOAT32X8 result = MS_MLA256_F32(offset_8, data, scale_8); in ScaleAxis() 108 MS_FLOAT32X8 zeros_8 = {0, 0, 0, 0, 0, 0, 0, 0}; in ScaleInnerRelu() 119 MS_FLOAT32X8 scale_8 = MS_MOV256_F32(scale[i]); in ScaleInnerRelu() [all …]
|
D | add_fp32.c | 22 MS_FLOAT32X8 vin0_opt_8 = MS_MOV256_F32(in0[0]); in ElementOptAdd() 23 MS_FLOAT32X8 vin1_opt_8 = MS_MOV256_F32(in1[0]); in ElementOptAdd() 33 MS_FLOAT32X8 vin1 = MS_LD256_F32(in1 + index); in ElementOptAdd() 34 MS_FLOAT32X8 vout = MS_ADD256_F32(vin0_opt_8, vin1); in ElementOptAdd() 51 MS_FLOAT32X8 vin0 = MS_LD256_F32(in0 + index); in ElementOptAdd() 52 MS_FLOAT32X8 vout = MS_ADD256_F32(vin0, vin1_opt_8); in ElementOptAdd() 122 MS_FLOAT32X8 vin0_opt_8 = MS_MOV256_F32(in0[0]); in ElementOptAddRelu() 123 MS_FLOAT32X8 vin1_opt_8 = MS_MOV256_F32(in1[0]); in ElementOptAddRelu() 124 MS_FLOAT32X8 zeros_8 = MS_MOV256_F32(0.0f); in ElementOptAddRelu() 135 MS_FLOAT32X8 vin1 = MS_LD256_F32(in1 + index); in ElementOptAddRelu() [all …]
|
D | mul_fp32.c | 29 MS_FLOAT32X8 vin0 = MS_LD256_F32(in0 + index); in ElementMul() 30 MS_FLOAT32X8 vin1 = MS_LD256_F32(in1 + index); in ElementMul() 31 MS_FLOAT32X8 vout = MS_MUL256_F32(vin0, vin1); in ElementMul() 52 MS_FLOAT32X8 zeros_8 = MS_MOV256_F32(0.0f); in ElementMulRelu() 54 MS_FLOAT32X8 vin0 = MS_LD256_F32(in0 + index); in ElementMulRelu() 55 MS_FLOAT32X8 vin1 = MS_LD256_F32(in1 + index); in ElementMulRelu() 56 MS_FLOAT32X8 vout = MS_MUL256_F32(vin0, vin1); in ElementMulRelu() 81 MS_FLOAT32X8 zeros_8 = MS_MOV256_F32(0.0f); in ElementMulRelu6() 82 MS_FLOAT32X8 bounds_8 = MS_MOV256_F32(6.0f); in ElementMulRelu6() 84 MS_FLOAT32X8 vin0 = MS_LD256_F32(in0 + index); in ElementMulRelu6() [all …]
|
D | resize_fp32.c | 168 MS_FLOAT32X8 left_w_8 = MS_MOV256_F32(x_left_weights[w]); in InterpRow() 169 MS_FLOAT32X8 right_w_8 = MS_MOV256_F32(1.0f - x_left_weights[w]); in InterpRow() 171 MS_FLOAT32X8 left = MS_LD256_F32(src_line + x_lefts[w] * in_c + c); in InterpRow() 172 MS_FLOAT32X8 right = MS_LD256_F32(src_line + x_rights[w] * in_c + c); in InterpRow() 173 …MS_FLOAT32X8 interp_value = MS_ADD256_F32(MS_MUL256_F32(left, left_w_8), MS_MUL256_F32(right, righ… in InterpRow() 204 MS_FLOAT32X8 bottom_w_8 = MS_MOV256_F32(y_bottom_weight); in InterpCol() 205 MS_FLOAT32X8 top_w_8 = MS_MOV256_F32(1.0f - y_bottom_weight); in InterpCol() 207 MS_FLOAT32X8 bottom = MS_LD256_F32(bottom_line + w * in_c + c); in InterpCol() 208 MS_FLOAT32X8 top = MS_LD256_F32(top_line + w * in_c + c); in InterpCol() 209 …MS_FLOAT32X8 interp_value = MS_ADD256_F32(MS_MUL256_F32(bottom, bottom_w_8), MS_MUL256_F32(top, to… in InterpCol() [all …]
|
D | activation_fp32.c | 24 MS_FLOAT32X8 zero_8 = MS_MOV256_F32(0.0f); in Fp32Relu() 46 MS_FLOAT32X8 zero_8 = MS_MOV256_F32(0.0f); in Fp32Relu6() 47 MS_FLOAT32X8 six_8 = MS_MOV256_F32(6.0f); in Fp32Relu6() 49 MS_FLOAT32X8 dst_tmp = MS_MAX256_F32(MS_LD256_F32(src + i), zero_8); in Fp32Relu6() 78 MS_FLOAT32X8 src_tmp = MS_LD256_F32(src + i); in LRelu() 79 MS_FLOAT32X8 mul_tmp = MS_MUL256_N_F32(src_tmp, alpha); in LRelu() 80 MS_FLOAT32X8 mask = MS_CMP256_F32(src_tmp, MS_MOV256_F32(0.0f), 30); in LRelu() 143 MS_FLOAT32X8 input = MS_LD256_F32(src + i); in Tanh() 168 MS_FLOAT32X8 src_value = MS_LD256_F32(src + index); in Swish() 169 MS_FLOAT32X8 sigmoid_value = MS_LD256_F32(dst + index); in Swish() [all …]
|
D | exp_fp32.h | 59 static inline void simd_exp_avx(MS_FLOAT32X8 input, float *dst) { in simd_exp_avx() 60 static MS_FLOAT32X8 maxv = {88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f}; in simd_exp_avx() 61 static MS_FLOAT32X8 minv = {-88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f, -88.0f}; in simd_exp_avx() 62 static MS_FLOAT32X8 param[] = { in simd_exp_avx() 71 MS_FLOAT32X8 decimal = MS_SUB256_F32(input, MS_MUL256_F32(MS_CVT256EPI32_PS(integer), param[0])); in simd_exp_avx() 73 …MS_FLOAT32X8 tmp = MS_MUL256_F32(decimal, (MS_ADD256_F32(param[2], MS_MUL256_F32(decimal, param[1]… in simd_exp_avx() 75 …MS_FLOAT32X8 decimal_exp = MS_ADD256_F32(param[5], MS_MUL256_F32(decimal, MS_ADD256_F32(param[5], … in simd_exp_avx()
|
D | instance_norm_fp32.c | 87 MS_FLOAT32X8 meanv8 = MS_MOV256_F32(mean); in InstanceNorm() 88 MS_FLOAT32X8 denov8 = MS_MOV256_F32(deno); in InstanceNorm() 90 MS_FLOAT32X8 srcv8 = MS_LD256_F32(src + index); in InstanceNorm() 91 MS_FLOAT32X8 dstv8 = in InstanceNorm() 275 MS_FLOAT32X8 hw_planev = MS_MOV256_F32((float)(hw_plane)); in InstanceNormNC8HW8() 284 MS_FLOAT32X8 mean = MS_MOV256_F32(0.0f), mean1 = MS_MOV256_F32(0.0f); in InstanceNormNC8HW8() 285 MS_FLOAT32X8 squ_m = MS_MOV256_F32(0.0f), squ_m1 = MS_MOV256_F32(0.0f); in InstanceNormNC8HW8() 287 … MS_FLOAT32X8 srcv = MS_LD256_F32(src + index * C8NUM), srcv1 = MS_LD256_F32(src1 + index * C8NUM); in InstanceNormNC8HW8() 288 MS_FLOAT32X8 squarev = MS_MUL256_F32(srcv, srcv), squarev1 = MS_MUL256_F32(srcv1, srcv1); in InstanceNormNC8HW8() 298 MS_FLOAT32X8 deno = in InstanceNormNC8HW8() [all …]
|
D | sub_fp32.c | 20 MS_FLOAT32X8 vin0_opt_8 = MS_MOV256_F32(in0[0]); in ElementOptSub() 21 MS_FLOAT32X8 vin1_opt_8 = MS_MOV256_F32(in1[0]); in ElementOptSub() 31 MS_FLOAT32X8 vin1 = MS_LD256_F32(in1 + index); in ElementOptSub() 32 MS_FLOAT32X8 vout = MS_SUB256_F32(vin0_opt_8, vin1); in ElementOptSub() 49 MS_FLOAT32X8 vin0 = MS_LD256_F32(in0 + index); in ElementOptSub() 50 MS_FLOAT32X8 vout = MS_SUB256_F32(vin0, vin1_opt_8); in ElementOptSub() 179 MS_FLOAT32X8 vin0 = MS_LD256_F32(in0 + index); in ElementSub() 180 MS_FLOAT32X8 vin1 = MS_LD256_F32(in1 + index); in ElementSub() 181 MS_FLOAT32X8 vout = MS_SUB256_F32(vin0, vin1); in ElementSub()
|
D | pooling_fp32.c | 36 MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf); in AvgPooling() 37 MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf); in AvgPooling() 70 MS_FLOAT32X8 tmp_avg = MS_MOV256_F32(0); in AvgPooling() 151 MS_FLOAT32X8 min_value_8 = MS_MOV256_F32(minf); in MaxPooling() 152 MS_FLOAT32X8 max_value_8 = MS_MOV256_F32(maxf); in MaxPooling() 185 MS_FLOAT32X8 tmp_max = MS_MOV256_F32(-FLT_MAX); in MaxPooling()
|
D | prelu_fp32.c | 141 MS_FLOAT32X8 src_tmp = MS_LD256_F32(input + i); in PReluShareChannel() 142 MS_FLOAT32X8 mul_tmp = MS_MUL256_N_F32(src_tmp, slope); in PReluShareChannel() 143 MS_FLOAT32X8 mask = MS_CMP256_F32(src_tmp, MS_MOV256_F32(0.0f), mask_offset); in PReluShareChannel()
|
D | reduce_fp32.c | 473 MS_FLOAT32X8 tmp = {0, 0, 0, 0, 0, 0, 0, 0}; in ReduceSumDim2Axis0() 505 MS_FLOAT32X8 tmp_arr_8 = MS_MOV256_F32(tmp_arr[0]); in ReduceSumDim2Axis1() 507 MS_FLOAT32X8 src_in = MS_LD256_F32(src_data + k); in ReduceSumDim2Axis1()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/intrinsics/ |
D | ms_simd_instructions.h | 91 #define MS_FLOAT32X8 __m256 macro 215 static inline MS_FLOAT32X8 MS_SQRTFX8_F32(MS_FLOAT32X8 src) { in MS_SQRTFX8_F32() 216 MS_FLOAT32X8 dst; in MS_SQRTFX8_F32() 229 MS_FLOAT32X8 src##1 = MS_LD256_F32(input_ptr + 0 * num); \ 230 MS_FLOAT32X8 src##2 = MS_LD256_F32(input_ptr + 1 * num); \ 231 MS_FLOAT32X8 src##3 = MS_LD256_F32(input_ptr + 2 * num); \ 232 MS_FLOAT32X8 src##4 = MS_LD256_F32(input_ptr + 3 * num); \ 233 MS_FLOAT32X8 src##5 = MS_LD256_F32(input_ptr + 4 * num); \ 234 MS_FLOAT32X8 src##6 = MS_LD256_F32(input_ptr + 5 * num); \ 235 MS_FLOAT32X8 src##7 = MS_LD256_F32(input_ptr + 6 * num); \ [all …]
|