/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp32/reduce_fp32.h"
#include <float.h>
#include <limits.h>  // INT_MAX
#include <stdint.h>  // INT32_MAX
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"

#ifdef ENABLE_NNACL_INFER_SHAPE
#include "nnacl/reduce_parameter.h"
#endif

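// All reducers below view the input as a flattened [outer_size, axis_size, inner_size]
// tensor and collapse the middle (axis) dimension. Work is split across threads by
// striding the outer dimension with (tid, thread_num).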
int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
               int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 0.0f;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp / (float)axis_size;
    }
  }
  return NNACL_OK;
}

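// On NEON builds, the integer mean vectorizes the inner dimension in blocks of
// C4NUM (4) lanes, accumulating with vaddq_s32 and dividing each lane at the end;
// a scalar loop handles the remaining inner_size % C4NUM columns.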
int IntReduceMean(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                  int thread_num) {
  if (axis_size == 0) {
    return NNACL_ERR;
  }
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  NNACL_CHECK_ZERO_RETURN_ERR(axis_size);
  int i, j;
#ifdef ENABLE_NEON
  int block_mod = inner_size % C4NUM;
  int block_c4 = inner_size - block_mod;
#endif
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    int k = 0;
#ifdef ENABLE_NEON
    for (; k < block_c4; k += C4NUM) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int32x4_t tmp = {0, 0, 0, 0};
      for (i = 0; i < axis_size; i++) {
        tmp = vaddq_s32(tmp, vld1q_s32(inner_src + i * inner_size));
      }
      tmp[0] /= axis_size;
      tmp[1] /= axis_size;
      tmp[2] /= axis_size;
      tmp[3] /= axis_size;
      vst1q_s32(inner_dst, tmp);
    }
#endif
    for (; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = 0;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp / axis_size;
    }
  }
  return NNACL_OK;
}

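// The sum reducers follow the same blocking scheme as the means, minus the final
// division by axis_size.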
int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j;
#ifdef ENABLE_NEON
  int block_mod = inner_size % C4NUM;
  int block_c4 = inner_size - block_mod;
#endif
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    int k = 0;
#ifdef ENABLE_NEON
    for (; k < block_c4; k += C4NUM) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float32x4_t tmp = {0, 0, 0, 0};
      for (i = 0; i < axis_size; i++) {
        tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size));
      }
      vst1q_f32(inner_dst, tmp);
    }
#endif
    for (; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 0.0f;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                 int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j;
#ifdef ENABLE_NEON
  int block_mod = inner_size % C4NUM;
  int block_c4 = inner_size - block_mod;
#endif
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    int k = 0;
#ifdef ENABLE_NEON
    for (; k < block_c4; k += C4NUM) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int32x4_t tmp = {0, 0, 0, 0};
      for (i = 0; i < axis_size; i++) {
        tmp = vaddq_s32(tmp, vld1q_s32(inner_src + i * inner_size));
      }
      vst1q_s32(inner_dst, tmp);
    }
#endif
    for (; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = 0;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

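// The max/min reducers below seed the accumulator with the identity element of
// the comparison: -FLT_MAX / -INT_MAX for max, FLT_MAX / INT32_MAX for min.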
int ReduceMax(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = -FLT_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp > inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int IntReduceMax(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                 int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = -INT_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp > inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int ReduceMin(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = FLT_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp < inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int IntReduceMin(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                 int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = INT32_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp < inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

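// ReduceAll is a logical AND across the axis; it starts from true and does not
// short-circuit, so every element is visited.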
int ReduceAll(int outer_size, int inner_size, int axis_size, const bool *src_data, bool *dst_data, int tid,
              int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const bool *outer_src = src_data + j * axis_size * inner_size;
    bool *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const bool *inner_src = outer_src + k;
      bool *inner_dst = outer_dst + k;
      bool tmp = true;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp && inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

int ReduceProd(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
               int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 1.0f;
      for (i = 0; i < axis_size; i++) {
        tmp *= inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

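// Unlike the float product above, the integer product guards every multiply with
// isMulOverflow and aborts with NNACL_ERRCODE_MUL_OVERFLOW on overflow.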
int IntReduceProd(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid,
                  int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int *outer_src = src_data + j * axis_size * inner_size;
    int *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int *inner_src = outer_src + k;
      int *inner_dst = outer_dst + k;
      int tmp = 1;
      for (i = 0; i < axis_size; i++) {
        if (isMulOverflow(tmp, inner_src[i * inner_size])) {
          return NNACL_ERRCODE_MUL_OVERFLOW;
        }
        tmp *= inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

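// Sum of squares over the axis: equivalent to ReduceSum applied to the
// element-wise square of the input.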
int ReduceSumSquare(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
                    int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  if (thread_num == 0) {
    return NNACL_PARAM_INVALID;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const float *outer_src = src_data + j * axis_size * inner_size;
    float *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const float *inner_src = outer_src + k;
      float *inner_dst = outer_dst + k;
      float tmp = 0.0f;
      for (i = 0; i < axis_size; i++) {
        tmp += inner_src[i * inner_size] * inner_src[i * inner_size];
      }
      *inner_dst = tmp;
    }
  }
  return NNACL_OK;
}

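// Shape inference: normalizes negative axes into [0, rank), expands a single
// reduce_to_end axis into the trailing axis list, and emits the output shape,
// keeping reduced dimensions as 1 when keep_dims is set.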
#ifdef ENABLE_NNACL_INFER_SHAPE
int ReduceInferShape(int **in_shape, size_t *dim_size, int *out_shape, int *in_format, int *out_format,
                     int *in_datatype, int *out_datatype, OpParameter *param) {
  *out_format = in_format[0];
  *out_datatype = in_datatype[0];
  ReduceParameter *reduce_parameter = (ReduceParameter *)param;
  bool keep_dims = reduce_parameter->keep_dims_;
  int num_axes = reduce_parameter->num_axes_;
  int *in_shape0 = in_shape[0];
  int rank = dim_size[0];
  if (rank <= 0 || rank > REDUCE_MAX_AXES_NUM) {
    return NNACL_PARAM_INVALID;
  }
  int axes[REDUCE_MAX_AXES_NUM];
  int actual_axes_num = num_axes;
  for (int i = 0; i < num_axes; ++i) {
    if (reduce_parameter->axes_[i] < -rank || reduce_parameter->axes_[i] >= rank) {
      return NNACL_PARAM_INVALID;
    }
    if (reduce_parameter->axes_[i] < 0) {
      axes[i] = reduce_parameter->axes_[i] + rank;
    } else {
      axes[i] = reduce_parameter->axes_[i];
    }
  }
  if (reduce_parameter->reduce_to_end_) {
    if (num_axes != 1) {
      return NNACL_PARAM_INVALID;
    }
    int begin_axis = axes[0];
    num_axes = rank - begin_axis;
    for (int i = begin_axis + 1; i < rank; ++i) {
      axes[actual_axes_num++] = i;
    }
  }
  if (num_axes == 0) {
    int j = 0;
    for (int i = 0; i < rank; ++i) {
      axes[i] = i;
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    }
    reduce_parameter->num_axes_ = rank;
    for (int i = 0; i < rank; ++i) {
      reduce_parameter->axes_[i] = axes[i];
    }
    return NNACL_OK;
  }
  // reduce on selected axes
  int j = 0;
  for (int i = 0; i < rank; ++i) {
    bool reduce_axis = false;
    for (int idx = 0; idx < num_axes; ++idx) {
      if (axes[idx] == i) {
        reduce_axis = true;
        break;
      }
    }
    if (reduce_axis) {
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    } else {
      out_shape[j++] = in_shape0[i];
    }
  }
  reduce_parameter->num_axes_ = num_axes;
  for (int i = 0; i < num_axes; ++i) {
    reduce_parameter->axes_[i] = axes[i];
  }
  return NNACL_OK;
}
#endif

// Reduces a [row_len, col_len] matrix along axis 0: [A, B] -> [B].
// col_size: number of columns handled by this call; callers can split the
// column range across workers for parallelism.
int ReduceSumDim2Axis0(size_t col_size, size_t col_len, size_t row_len, const float *src_data, float *dst_data) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
#ifdef ENABLE_AVX
  size_t block_mod = col_size % C8NUM;
  size_t block_c8 = col_size - block_mod;
#endif
  size_t k = 0;
#ifdef ENABLE_AVX
  for (; k < block_c8; k += C8NUM) {
    MS_FLOAT32X8 tmp = {0, 0, 0, 0, 0, 0, 0, 0};
    const float *inner_src = src_data + k;
    float *inner_dst = dst_data + k;
    for (size_t i = 0; i < row_len; ++i) {
      tmp = MS_ADD256_F32(tmp, MS_LD256_F32(inner_src + i * col_len));
    }
    MS_ST256_F32(inner_dst, tmp);
  }
#endif
  for (; k < col_size; k++) {
    const float *inner_src = src_data + k;
    float *inner_dst = dst_data + k;
    float tmp = 0.0f;
    for (size_t i = 0; i < row_len; i++) {
      tmp += inner_src[i * col_len];
    }
    *inner_dst = tmp;
  }
  return NNACL_OK;
}

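// On AVX builds, eight partial sums are accumulated in a 256-bit register and
// folded into a scalar afterwards; the tail of col_len % C8NUM elements is
// summed in the scalar loop.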
// Reduces along axis 1: [A, B] -> [A]. This call sums one row of col_len elements.
int ReduceSumDim2Axis1(size_t col_len, const float *src_data, float *dst_data) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  size_t k = 0;
  float tmp = 0;
#ifdef ENABLE_AVX
  size_t block_mod = col_len % C8NUM;
  size_t block_c8 = col_len - block_mod;
  float tmp_arr[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 tmp_arr_8 = MS_MOV256_F32(tmp_arr[0]);
  for (; k < block_c8; k += C8NUM) {
    MS_FLOAT32X8 src_in = MS_LD256_F32(src_data + k);
    tmp_arr_8 = MS_ADD256_F32(tmp_arr_8, src_in);
  }
  MS_ST256_F32(tmp_arr, tmp_arr_8);
  for (size_t i = 0; i < 8; ++i) {
    tmp += tmp_arr[i];
  }
#endif
  for (; k < col_len; k++) {
    tmp += src_data[k];
  }
  dst_data[0] = tmp;
  return NNACL_OK;
}
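
// Usage sketch (illustrative only, not part of the library): reducing a
// [2, 3, 4] tensor over axis 1 with a single thread maps to outer_size = 2,
// axis_size = 3, inner_size = 4:
//
//   float src[2 * 3 * 4] = { /* ... */ };
//   float dst[2 * 4];
//   int ret = ReduceSum(2, 4, 3, src, dst, 0, 1);  // tid = 0, thread_num = 1
//   if (ret != NNACL_OK) { /* handle error */ }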