1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "nnacl/fp32/reduce_fp32.h"
18 #include <float.h>
19 #include "nnacl/errorcode.h"
20 #include "nnacl/common_func.h"
21 #include "nnacl/reduce_fp32_simd.h"
22 #ifdef ENABLE_NNACL_INFER_SHAPE
23 #include "nnacl/reduce_parameter.h"
24 #endif
25
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
// Scalar tail of a reduction kernel: finishes the inner-dimension positions
// [k, inner_size) that the preceding SIMD pass left over.  For each position it
// expands the op-specific hooks: op_name##PreDeal (accumulator init),
// op_name##MidCalc (fold one element read with stride inner_size along the
// reduced axis), and op_name##PostDeal (store the result into outer_dst[k]).
#define ReduceCoreCalc(op_name, op_type, outer_src, outer_dst, k) \
  for (; k < inner_size; k++) {                                   \
    const op_type *inner_src = outer_src + k;                     \
    op_name##PreDeal;                                             \
    for (int i = 0; i < axis_size; i++) {                         \
      op_name##MidCalc;                                           \
    }                                                             \
    op_name##PostDeal;                                            \
  }
36
// Generates a threaded reduction kernel named `op_name` over the layout
// [outer_size, axis_size, inner_size] -> [outer_size, inner_size].
// Outer slices are distributed round-robin over threads via (tid, thread_num);
// within a slice, a SIMD pass advances k over the leading inner positions and
// ReduceCoreCalc handles the scalar remainder.  Returns NNACL_OK on success,
// NNACL_NULL_PTR / NNACL_PARAM_INVALID / NNACL_ERR on bad arguments.
#define RegReduceOp(op_name, op_type)                                                                             \
  int op_name(int outer_size, int inner_size, int axis_size, const op_type *src_data, op_type *dst_data, int tid, \
              int thread_num) {                                                                                   \
    NNACL_CHECK_TRUE_RET(src_data != NULL && dst_data != NULL, NNACL_NULL_PTR);                                   \
    NNACL_CHECK_TRUE_RET(thread_num > 0, NNACL_PARAM_INVALID);                                                    \
    NNACL_CHECK_TRUE_RET(axis_size > 0, NNACL_ERR);                                                               \
    for (int j = tid; j < outer_size; j += thread_num) {                                                          \
      const op_type *outer_src = src_data + j * axis_size * inner_size;                                           \
      op_type *outer_dst = dst_data + j * inner_size;                                                             \
      int k = 0;                                                                                                  \
      SIMD_RUN_NO_SCALAR(op_name, k, outer_src, outer_dst, inner_size, axis_size);                                \
                                                                                                                  \
      ReduceCoreCalc(op_name, op_type, outer_src, outer_dst, k);                                                  \
    }                                                                                                             \
    return NNACL_OK;                                                                                              \
  }
53
// ReduceSum: dst[k] = sum over the reduced axis of src[k + i * inner_size].
#define ReduceSumPreDeal float tmp = 0;
#define ReduceSumMidCalc tmp += inner_src[i * inner_size];
#define ReduceSumPostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceSum, float);
59
ReduceSumByLastAxis(int outer_size,int inner_size,int axis_size,const float * src_data,float * dst_data,int tid,int thread_num)60 int ReduceSumByLastAxis(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
61 int thread_num) {
62 NNACL_CHECK_TRUE_RET(src_data != NULL && dst_data != NULL, NNACL_NULL_PTR);
63 NNACL_CHECK_TRUE_RET(thread_num > 0, NNACL_PARAM_INVALID);
64 NNACL_CHECK_TRUE_RET(axis_size > 0, NNACL_ERR);
65
66 for (int j = tid; j < outer_size; j += thread_num) {
67 const float *src_tmp = src_data + j * axis_size;
68
69 float tmp = src_tmp[0];
70 int i = 1;
71
72 SIMD_RUN_NO_SCALAR(ReduceSumByLastAxis, i, src_tmp, &tmp, axis_size);
73 for (; i < axis_size; i++) {
74 tmp += src_tmp[i];
75 }
76 dst_data[j] = tmp;
77 }
78 return NNACL_OK;
79 }
80
// ReduceMean: sum over the axis divided by axis_size (float division).
#define ReduceMeanPreDeal float tmp = 0;
#define ReduceMeanMidCalc tmp += inner_src[i * inner_size];
#define ReduceMeanPostDeal outer_dst[k] = tmp / axis_size;
RegReduceOp(ReduceMean, float);

// ReduceMin: minimum over the axis, accumulator seeded with FLT_MAX.
#define ReduceMinPreDeal float tmp = FLT_MAX;
#define ReduceMinMidCalc tmp = fminf(tmp, inner_src[i * inner_size]);
#define ReduceMinPostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceMin, float);
92
93 // ReduceMax
94 #define ReduceMaxPreDeal float tmp = FLT_MIN;
95 #define ReduceMaxMidCalc tmp = fmaxf(tmp, inner_src[i * inner_size]);
96 #define ReduceMaxPostDeal outer_dst[k] = tmp;
97 RegReduceOp(ReduceMax, float);
98
ReduceMaxByLastAxis(int outer_size,int inner_size,int axis_size,const float * src_data,float * dst_data,int tid,int thread_num)99 int ReduceMaxByLastAxis(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
100 int thread_num) {
101 NNACL_CHECK_TRUE_RET(src_data != NULL && dst_data != NULL, NNACL_NULL_PTR);
102 NNACL_CHECK_TRUE_RET(thread_num > 0, NNACL_PARAM_INVALID);
103 NNACL_CHECK_TRUE_RET(axis_size > 0, NNACL_ERR);
104
105 for (int j = tid; j < outer_size; j += thread_num) {
106 const float *src_tmp = src_data + j * axis_size;
107
108 float tmp = src_tmp[0];
109 int i = 1;
110
111 SIMD_RUN_NO_SCALAR(ReduceMaxByLastAxis, i, src_tmp, &tmp, axis_size);
112 for (; i < axis_size; i++) {
113 tmp = fmaxf(tmp, src_tmp[i]);
114 }
115 dst_data[j] = tmp;
116 }
117 return NNACL_OK;
118 }
119
// ReduceProd: product over the axis (no overflow check for float).
#define ReduceProdPreDeal float tmp = 1.0f;
#define ReduceProdMidCalc tmp *= inner_src[i * inner_size];
#define ReduceProdPostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceProd, float);

// ReduceSumSquare: sum of squared elements over the axis.
#define ReduceSumSquarePreDeal float tmp = 0;
#define ReduceSumSquareMidCalc tmp += (inner_src[i * inner_size] * inner_src[i * inner_size]);
#define ReduceSumSquarePostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceSumSquare, float);
131
132 // ReduceL2Norm
133 #define ReduceL2NormPreDeal float tmp = 0;
134 #define ReduceL2NormMidCalc tmp += (inner_src[i * inner_size] * inner_src[i * inner_size]);
135 #define ReduceL2NormPostDeal outer_dst[k] = sqrt(tmp);
136 RegReduceOp(ReduceL2Norm, float);
137
// IntReduceSum: int32 sum over the axis (no overflow check, unlike IntReduceProd).
#define IntReduceSumPreDeal int tmp = 0;
#define IntReduceSumMidCalc tmp += inner_src[i * inner_size];
#define IntReduceSumPostDeal outer_dst[k] = tmp;
RegReduceOp(IntReduceSum, int32_t);

// IntReduceMean: int32 sum then integer division (truncates toward zero).
#define IntReduceMeanPreDeal int tmp = 0;
#define IntReduceMeanMidCalc tmp += inner_src[i * inner_size];
#define IntReduceMeanPostDeal outer_dst[k] = tmp / axis_size;
RegReduceOp(IntReduceMean, int32_t);

// IntReduceMin: int32 minimum, accumulator seeded with INT32_MAX.
#define IntReduceMinPreDeal int tmp = INT32_MAX;
#define IntReduceMinMidCalc tmp = MSMIN(tmp, inner_src[i * inner_size]);
#define IntReduceMinPostDeal outer_dst[k] = tmp;
RegReduceOp(IntReduceMin, int32_t);

// IntReduceMax: int32 maximum, accumulator seeded with INT32_MIN.
#define IntReduceMaxPreDeal int tmp = INT32_MIN;
#define IntReduceMaxMidCalc tmp = MSMAX(tmp, inner_src[i * inner_size]);
#define IntReduceMaxPostDeal outer_dst[k] = tmp;
RegReduceOp(IntReduceMax, int32_t);
161
ReduceAll(int outer_size,int inner_size,int axis_size,const bool * src_data,bool * dst_data,int tid,int thread_num)162 int ReduceAll(int outer_size, int inner_size, int axis_size, const bool *src_data, bool *dst_data, int tid,
163 int thread_num) {
164 if (src_data == NULL || dst_data == NULL) {
165 return NNACL_NULL_PTR;
166 }
167 if (thread_num == 0) {
168 return NNACL_PARAM_INVALID;
169 }
170 int i, j, k;
171 for (j = tid; j < outer_size; j += thread_num) {
172 const bool *outer_src = src_data + j * axis_size * inner_size;
173 bool *outer_dst = dst_data + j * inner_size;
174 for (k = 0; k < inner_size; k++) {
175 const bool *inner_src = outer_src + k;
176 bool *inner_dst = outer_dst + k;
177 bool tmp = true;
178 for (i = 0; i < axis_size; i++) {
179 tmp = tmp && inner_src[i * inner_size];
180 }
181 *inner_dst = tmp;
182 }
183 }
184 return NNACL_OK;
185 }
186
IntReduceProd(int outer_size,int inner_size,int axis_size,const int32_t * src_data,int32_t * dst_data,int tid,int thread_num)187 int IntReduceProd(int outer_size, int inner_size, int axis_size, const int32_t *src_data, int32_t *dst_data, int tid,
188 int thread_num) {
189 if (src_data == NULL || dst_data == NULL) {
190 return NNACL_NULL_PTR;
191 }
192 if (thread_num == 0) {
193 return NNACL_PARAM_INVALID;
194 }
195 int i, j, k;
196 for (j = tid; j < outer_size; j += thread_num) {
197 const int32_t *outer_src = src_data + j * axis_size * inner_size;
198 int32_t *outer_dst = dst_data + j * inner_size;
199 for (k = 0; k < inner_size; k++) {
200 const int32_t *inner_src = outer_src + k;
201 int32_t *inner_dst = outer_dst + k;
202 int tmp = 1;
203 for (i = 0; i < axis_size; i++) {
204 if (isMulOverflow(tmp, inner_src[i * inner_size])) {
205 return NNACL_ERRCODE_MUL_OVERFLOW;
206 }
207 tmp *= inner_src[i * inner_size];
208 }
209 *inner_dst = tmp;
210 }
211 }
212 return NNACL_OK;
213 }
214
215 #ifdef ENABLE_NNACL_INFER_SHAPE
// Infers the output shape of a Reduce op and canonicalizes the axis list held
// in `param`: negative axes are wrapped into [0, rank), and reduce_to_end_ is
// expanded to an explicit axis list.  Output dtype/format mirror input 0.
// With keep_dims, each reduced dimension becomes 1 in out_shape; otherwise it
// is dropped.  The resolved axes are written back into `param` so later
// execution sees only canonical non-negative axes.
int ReduceInferShape(int32_t **in_shape, size_t *dim_size, int32_t *out_shape, int32_t *in_format, int32_t *out_format,
                     int32_t *in_datatype, int32_t *out_datatype, OpParameter *param) {
  *out_format = in_format[0];
  *out_datatype = in_datatype[0];
  ReduceParameter *reduce_parameter = (ReduceParameter *)param;
  bool keep_dims = reduce_parameter->keep_dims_;
  int num_axes = reduce_parameter->num_axes_;
  int32_t *in_shape0 = in_shape[0];
  int rank = dim_size[0];
  NNACL_CHECK_TRUE_RET(rank > 0 && rank <= REDUCE_MAX_AXES_NUM, NNACL_PARAM_INVALID);
  int axes[REDUCE_MAX_AXES_NUM];
  int actual_axes_num = num_axes;
  // Wrap negative axes (counted from the end) into [0, rank).
  for (int i = 0; i < num_axes; ++i) {
    NNACL_CHECK_TRUE_RET(reduce_parameter->axes_[i] >= -rank && reduce_parameter->axes_[i] < rank, NNACL_PARAM_INVALID);
    if (reduce_parameter->axes_[i] < 0) {
      axes[i] = reduce_parameter->axes_[i] + rank;
    } else {
      axes[i] = reduce_parameter->axes_[i];
    }
  }
  // reduce_to_end_: one start axis means "reduce it and every axis after it".
  if (reduce_parameter->reduce_to_end_) {
    NNACL_CHECK_TRUE_RET(num_axes == 1, NNACL_PARAM_INVALID);
    int begin_axis = axes[0];
    num_axes = rank - begin_axis;
    for (int i = begin_axis + 1; i < rank; ++i) {
      axes[actual_axes_num++] = i;
    }
  }
  // Empty axis list: reduce over every dimension.  With keep_dims the output
  // is all ones; without it the output is a scalar (nothing written).
  if (num_axes == 0) {
    int j = 0;
    for (int i = 0; i < rank; ++i) {
      axes[i] = i;
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    }
    reduce_parameter->num_axes_ = rank;
    for (int i = 0; i < rank; ++i) {
      reduce_parameter->axes_[i] = axes[i];
    }
    return NNACL_OK;
  }
  // reduce on selected axes
  int j = 0;
  for (int i = 0; i < rank; ++i) {
    // Linear scan is fine: rank and num_axes are bounded by REDUCE_MAX_AXES_NUM.
    bool reduce_axis = false;
    for (int idx = 0; idx < num_axes; ++idx) {
      if (axes[idx] == i) {
        reduce_axis = true;
        break;
      }
    }
    if (reduce_axis) {
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    } else {
      out_shape[j++] = in_shape0[i];
    }
  }
  reduce_parameter->num_axes_ = num_axes;
  for (int i = 0; i < num_axes; ++i) {
    reduce_parameter->axes_[i] = axes[i];
  }
  return NNACL_OK;
}
282 #endif
283
284 // [A, B] -> [B]
285 // col_size : start -> end for parallel
ReduceSumDim2Axis0(size_t col_size,size_t col_len,size_t row_len,const float * src_data,float * dst_data)286 int ReduceSumDim2Axis0(size_t col_size, size_t col_len, size_t row_len, const float *src_data, float *dst_data) {
287 if (src_data == NULL || dst_data == NULL) {
288 return NNACL_NULL_PTR;
289 }
290
291 size_t k = 0;
292 SIMD_RUN_NO_SCALAR(ReduceSumDim2Axis0, k, col_size, col_len, row_len, src_data, dst_data);
293 for (; k < col_size; k++) {
294 const float *inner_src = src_data + k;
295 float *inner_dst = dst_data + k;
296 float tmp = 0.0f;
297 for (size_t i = 0; i < row_len; i++) {
298 tmp += inner_src[i * col_len];
299 }
300 *inner_dst = tmp;
301 }
302 return NNACL_OK;
303 }
304
// [A, B] -> [A]
// Sums one contiguous row of col_len floats into dst_data[0] (row-wise
// reduction along axis 1; presumably the caller invokes this once per row —
// confirm against call sites).  Returns NNACL_OK or NNACL_NULL_PTR.
int ReduceSumDim2Axis1(size_t col_len, const float *src_data, float *dst_data) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  size_t k = 0;
  float tmp = 0;
#ifdef ENABLE_AVX
  // 8-lane AVX accumulation over the largest multiple of C8NUM, followed by a
  // horizontal sum of the 8 partial lanes into tmp.
  size_t block_mod = col_len % C8NUM;
  size_t block_c8 = col_len - block_mod;
  float tmp_arr[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 tmp_arr_8 = MS_MOV256_F32(tmp_arr[0]);
  for (; k < block_c8; k += C8NUM) {
    MS_FLOAT32X8 src_in = MS_LD256_F32(src_data + k);
    tmp_arr_8 = MS_ADD256_F32(tmp_arr_8, src_in);
  }
  MS_ST256_F32(tmp_arr, tmp_arr_8);
  for (size_t i = 0; i < 8; ++i) {
    tmp += tmp_arr[i];
  }
#endif
  // Scalar tail (or the entire sum when AVX is disabled).
  for (; k < col_len; k++) {
    tmp += src_data[k];
  }
  dst_data[0] = tmp;
  return NNACL_OK;
}
332
ReduceMeanWithAxis(const float * src_data,float * mean,int64_t size)333 int ReduceMeanWithAxis(const float *src_data, float *mean, int64_t size) {
334 if (size == 0 || src_data == NULL) {
335 return NNACL_NULL_PTR;
336 }
337 float sum = 0.0;
338 int64_t i = 0;
339 SIMD_RUN_NO_SCALAR(ReduceSumByLastAxis, i, src_data, &sum, 0);
340 for (; i < size; ++i) {
341 sum += src_data[i];
342 }
343 *mean = sum / size;
344 return NNACL_OK;
345 }
346
ReduceDeviation(const float * src_data,int64_t size,float mean,float * deviation)347 int ReduceDeviation(const float *src_data, int64_t size, float mean, float *deviation) {
348 if (size == 0 || src_data == NULL) {
349 return NNACL_NULL_PTR;
350 }
351 int64_t i = 0;
352 SIMD_RUN_NO_SCALAR(FloatReduceDeviation, i, src_data, mean, size, deviation);
353 for (; i < size; ++i) {
354 float tmp = src_data[i] - mean;
355 tmp = tmp * tmp;
356 *deviation += tmp;
357 }
358 return NNACL_OK;
359 }
360