/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp32/reduce_fp32.h"
#include <float.h>
#include <limits.h>  // INT_MIN for the integer max-reduction
#include <stdint.h>  // INT32_MAX for the integer min-reduction
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"

#ifdef ENABLE_NNACL_INFER_SHAPE
#include "nnacl/reduce_parameter.h"
#endif

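/* Every kernel below views the input as a flattened [outer_size, axis_size, inner_size] tensor
 * and reduces over the middle (axis) dimension; e.g. reducing axis 1 of a [2, 3, 4] tensor uses
 * outer_size = 2, axis_size = 3, inner_size = 4 and produces a [2, 4] result. Work is split
 * across threads by striding the outer dimension: the caller invokes the kernel once per thread
 * with tid in [0, thread_num). */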
int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
               int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  if (axis_size == 0) {  // guard the division below, mirroring IntReduceMean
    return NNACL_ERR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 0.0f;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp / (float)axis_size;
    }
  }
  return NNACL_OK;
}

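/* Minimal usage sketch (the shapes are illustrative assumptions): take the mean over axis 1 of a
 * [2, 3, 4] float tensor into a [2, 4] result on a single thread:
 *
 *   float src[2 * 3 * 4] = { ... };
 *   float dst[2 * 4];
 *   int ret = ReduceMean(2, 4, 3, src, dst, 0, 1);  // outer = 2, inner = 4, axis = 3
 */
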
int IntReduceMean(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                  int thread_num) {
  if (axis_size == 0) {
    return NNACL_ERR;
  }
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j;
#ifdef ENABLE_NEON
  int block_mod = inner_size % C4NUM;
  int block_c4 = inner_size - block_mod;
#endif
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    int k = 0;
#ifdef ENABLE_NEON
    // NEON fast path: accumulate four inner-dimension lanes per iteration.
    for (; k < block_c4; k += C4NUM) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int32x4_t tmp = {0, 0, 0, 0};
      for (i = 0; i < axis_size; i++) {
        tmp = vaddq_s32(tmp, vld1q_s32(inner_src + i * inner_size));
      }
      tmp[0] /= axis_size;
      tmp[1] /= axis_size;
      tmp[2] /= axis_size;
      tmp[3] /= axis_size;
      vst1q_s32(inner_dst, tmp);
    }
#endif
    // Scalar loop covers the remaining elements (everything when NEON is disabled).
    for (; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = 0;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp / axis_size;
    }
  }
  return NNACL_OK;
}

int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j;
#ifdef ENABLE_NEON
  int block_mod = inner_size % C4NUM;
  int block_c4 = inner_size - block_mod;
#endif
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    int k = 0;
#ifdef ENABLE_NEON
    for (; k < block_c4; k += C4NUM) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float32x4_t tmp = {0, 0, 0, 0};
      for (i = 0; i < axis_size; i++) {
        tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size));
      }
      vst1q_f32(inner_dst, tmp);
    }
#endif
    for (; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 0.0f;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                 int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j;
#ifdef ENABLE_NEON
  int block_mod = inner_size % C4NUM;
  int block_c4 = inner_size - block_mod;
#endif
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    int k = 0;
#ifdef ENABLE_NEON
    for (; k < block_c4; k += C4NUM) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int32x4_t tmp = {0, 0, 0, 0};
      for (i = 0; i < axis_size; i++) {
        tmp = vaddq_s32(tmp, vld1q_s32(inner_src + i * inner_size));
      }
      vst1q_s32(inner_dst, tmp);
    }
#endif
    for (; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = 0;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int ReduceMax(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = -FLT_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp > inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int IntReduceMax(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                 int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = INT_MIN;  // identity for max; -INT_MAX would mishandle INT_MIN inputs
      for (i = 0; i < axis_size; i++) {
        tmp = tmp > inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int ReduceMin(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = FLT_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp < inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int IntReduceMin(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                 int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = INT32_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp < inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

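/* ReduceAll is a logical AND over the axis: the result is true only if every element along the
 * reduced axis is true. */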
int ReduceAll(int outer_size, int inner_size, int axis_size, const bool *src_data, bool *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const bool *outer_src = src_data + j * axis_size * inner_size;
    bool *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const bool *inner_src = outer_src + k;
      bool *inner_dst = outer_dst + k;
      bool tmp = true;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp && inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int ReduceProd(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
               int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 1.0f;
      for (i = 0; i < axis_size; i++) {
        tmp *= inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

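/* The integer product guards every multiplication with isMulOverflow and fails fast with
 * NNACL_ERRCODE_MUL_OVERFLOW instead of silently wrapping. */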
int IntReduceProd(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                  int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = 1;
      for (i = 0; i < axis_size; i++) {
        if (isMulOverflow(tmp, inner_src[i * inner_size])) {
          return NNACL_ERRCODE_MUL_OVERFLOW;
        }
        tmp *= inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int ReduceSumSquare(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
                    int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 0.0f;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size] * inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

#ifdef ENABLE_NNACL_INFER_SHAPE
int ReduceInferShape(int **in_shape, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
                     int *in_datatype, int *out_datatype, OpParameter *param) {
  *out_format = in_format[0];
  *out_datatype = in_datatype[0];
  ReduceParameter *reduce_parameter = (ReduceParameter *)param;
  bool keep_dims = reduce_parameter->keep_dims_;
  int num_axes = reduce_parameter->num_axes_;
  int *in_shape0 = in_shape[0];
  int rank = dim_size[0];
  if (rank <= 0 || rank > REDUCE_MAX_AXES_NUM) {
    return NNACL_PARAM_INVALID;
  }
  int axes[REDUCE_MAX_AXES_NUM];
  int actual_axes_num = num_axes;
  // Normalize negative axes into [0, rank).
  for (int i = 0; i < num_axes; ++i) {
    if (reduce_parameter->axes_[i] < -rank || reduce_parameter->axes_[i] >= rank) {
      return NNACL_PARAM_INVALID;
    }
    if (reduce_parameter->axes_[i] < 0) {
      axes[i] = reduce_parameter->axes_[i] + rank;
    } else {
      axes[i] = reduce_parameter->axes_[i];
    }
  }
  if (reduce_parameter->reduce_to_end_) {
    // Expand the single given axis into every axis from it through the last dimension.
    if (num_axes != 1) {
      return NNACL_PARAM_INVALID;
    }
    int begin_axis = axes[0];
    num_axes = rank - begin_axis;
    for (int i = begin_axis + 1; i < rank; ++i) {
      axes[actual_axes_num++] = i;
    }
  }
  if (num_axes == 0) {
    // No axes given: reduce over every dimension.
    int j = 0;
    for (int i = 0; i < rank; ++i) {
      axes[i] = i;
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    }
    reduce_parameter->num_axes_ = rank;
    for (int i = 0; i < rank; ++i) {
      reduce_parameter->axes_[i] = axes[i];
    }
    return NNACL_OK;
  }
  // reduce on selected axes
  int j = 0;
  for (int i = 0; i < rank; ++i) {
    bool reduce_axis = false;
    for (int idx = 0; idx < num_axes; ++idx) {
      if (axes[idx] == i) {
        reduce_axis = true;
        break;
      }
    }
    if (reduce_axis) {
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    } else {
      out_shape[j++] = in_shape0[i];
    }
  }
  reduce_parameter->num_axes_ = num_axes;
  for (int i = 0; i < num_axes; ++i) {
    reduce_parameter->axes_[i] = axes[i];
  }
  return NNACL_OK;
}
#endif

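/* A sketch of how the shape inference above might be driven. Field names are taken from the
 * code; the zero-initialized setup and the OpParameter cast layout are illustrative assumptions:
 *
 *   ReduceParameter reduce_param = {0};
 *   reduce_param.keep_dims_ = true;
 *   reduce_param.num_axes_ = 1;
 *   reduce_param.axes_[0] = 1;  // reduce axis 1
 *   int dims[4] = {2, 3, 4, 5};
 *   int *in_shape[1] = {dims};
 *   size_t dim_size[1] = {4};
 *   int out_shape[4], in_fmt = 0, out_fmt, in_dt = 0, out_dt;
 *   ReduceInferShape(in_shape, dim_size, out_shape, &in_fmt, &out_fmt, &in_dt, &out_dt,
 *                    (OpParameter *)&reduce_param);  // out_shape -> {2, 1, 4, 5}
 */
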
// [A, B] -> [B]
// col_size: number of columns this call handles; callers split the [0, col_len) column range for parallelism
int ReduceSumDim2Axis0(size_t col_size, size_t col_len, size_t row_len, const float *src_data, float *dst_data) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  size_t k = 0;
#ifdef ENABLE_AVX
  size_t block_mod = col_size % C8NUM;
  size_t block_c8 = col_size - block_mod;
  // AVX fast path: sum eight columns per iteration.
  for (; k < block_c8; k += C8NUM) {
    MS_FLOAT32X8 tmp = {0, 0, 0, 0, 0, 0, 0, 0};
    const float *inner_src = src_data + k;
    float *inner_dst = dst_data + k;
    for (size_t i = 0; i < row_len; ++i) {
      tmp = MS_ADD256_F32(tmp, MS_LD256_F32(inner_src + i * col_len));
    }
    MS_ST256_F32(inner_dst, tmp);
  }
#endif
  for (; k < col_size; k++) {
    const float *inner_src = src_data + k;
    float *inner_dst = dst_data + k;
    float tmp = 0.0f;
    for (size_t i = 0; i < row_len; i++) {
      tmp += inner_src[i * col_len];
    }
    *inner_dst = tmp;
  }
  return NNACL_OK;
}

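/* Illustrative call (the shapes are assumptions): summing the rows of a [3, 5] matrix into a
 * length-5 vector, with this single call covering all 5 columns:
 *
 *   float src[3 * 5] = { ... };
 *   float dst[5];
 *   ReduceSumDim2Axis0(5, 5, 3, src, dst);  // col_size = col_len = 5, row_len = 3
 */
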
// [A, B] -> [A]
int ReduceSumDim2Axis1(size_t col_len, const float *src_data, float *dst_data) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  size_t k = 0;
  float tmp = 0;
#ifdef ENABLE_AVX
  size_t block_mod = col_len % C8NUM;
  size_t block_c8 = col_len - block_mod;
  float tmp_arr[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 tmp_arr_8 = MS_MOV256_F32(tmp_arr[0]);
  // AVX fast path: accumulate eight elements per iteration, then reduce the vector horizontally.
  for (; k < block_c8; k += C8NUM) {
    MS_FLOAT32X8 src_in = MS_LD256_F32(src_data + k);
    tmp_arr_8 = MS_ADD256_F32(tmp_arr_8, src_in);
  }
  MS_ST256_F32(tmp_arr, tmp_arr_8);
  for (size_t i = 0; i < 8; ++i) {
    tmp += tmp_arr[i];
  }
#endif
  for (; k < col_len; k++) {
    tmp += src_data[k];
  }
  dst_data[0] = tmp;
  return NNACL_OK;
}
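
/* Illustrative usage (the shapes are assumptions): ReduceSumDim2Axis1 reduces one row at a time,
 * so a caller summing each row of a [3, 5] matrix invokes it once per row:
 *
 *   float src[3 * 5] = { ... };
 *   float dst[3];
 *   for (size_t r = 0; r < 3; ++r) {
 *     ReduceSumDim2Axis1(5, src + r * 5, dst + r);
 *   }
 */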