• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include "nnacl/fp32/reduce_fp32.h"
#include <float.h>
#include <math.h>
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"
#include "nnacl/reduce_fp32_simd.h"
#ifdef ENABLE_NNACL_INFER_SHAPE
#include "nnacl/reduce_parameter.h"
#endif
25 
// 32 bits, block_size : (512/256/128/32), block_num : (16/8/4/1)
// Scalar tail loop shared by all reduce kernels: for each remaining inner
// index k it runs <op>PreDeal (accumulator init), <op>MidCalc once per
// element along the reduced axis (stride inner_size), then <op>PostDeal
// (store into outer_dst[k]).  The per-op macros rely on the names
// `inner_src`, `i`, `inner_size`, `axis_size`, `outer_dst` and `k`
// introduced here / in the caller, so those identifiers must not change.
#define ReduceCoreCalc(op_name, op_type, outer_src, outer_dst, k) \
  for (; k < inner_size; k++) {                                   \
    const op_type *inner_src = outer_src + k;                     \
    op_name##PreDeal;                                             \
    for (int i = 0; i < axis_size; i++) {                         \
      op_name##MidCalc;                                           \
    }                                                             \
    op_name##PostDeal;                                            \
  }
36 
// Instantiates a complete reduce kernel `op_name` for element type `op_type`.
// The generated function reduces src [outer_size, axis_size, inner_size] into
// dst [outer_size, inner_size] along the middle axis.  Outer rows are split
// round-robin across `thread_num` workers (the caller invokes the kernel once
// per tid in [0, thread_num)).  The SIMD body from reduce_fp32_simd.h handles
// the vectorizable prefix of the inner dimension and advances k; the
// ReduceCoreCalc scalar loop finishes the remainder.
#define RegReduceOp(op_name, op_type)                                                                             \
  int op_name(int outer_size, int inner_size, int axis_size, const op_type *src_data, op_type *dst_data, int tid, \
              int thread_num) {                                                                                   \
    NNACL_CHECK_TRUE_RET(src_data != NULL && dst_data != NULL, NNACL_NULL_PTR);                                   \
    NNACL_CHECK_TRUE_RET(thread_num > 0, NNACL_PARAM_INVALID);                                                    \
    NNACL_CHECK_TRUE_RET(axis_size > 0, NNACL_ERR);                                                               \
    for (int j = tid; j < outer_size; j += thread_num) {                                                          \
      const op_type *outer_src = src_data + j * axis_size * inner_size;                                           \
      op_type *outer_dst = dst_data + j * inner_size;                                                             \
      int k = 0;                                                                                                  \
      SIMD_RUN_NO_SCALAR(op_name, k, outer_src, outer_dst, inner_size, axis_size);                                \
                                                                                                                  \
      ReduceCoreCalc(op_name, op_type, outer_src, outer_dst, k);                                                  \
    }                                                                                                             \
    return NNACL_OK;                                                                                              \
  }
53 
// ReduceSum
// Per-op hooks consumed by ReduceCoreCalc: start the accumulator at 0,
// add each element along the reduced axis, store the sum.
#define ReduceSumPreDeal float tmp = 0;
#define ReduceSumMidCalc tmp += inner_src[i * inner_size];
#define ReduceSumPostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceSum, float);
59 
ReduceSumByLastAxis(int outer_size,int inner_size,int axis_size,const float * src_data,float * dst_data,int tid,int thread_num)60 int ReduceSumByLastAxis(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
61                         int thread_num) {
62   NNACL_CHECK_TRUE_RET(src_data != NULL && dst_data != NULL, NNACL_NULL_PTR);
63   NNACL_CHECK_TRUE_RET(thread_num > 0, NNACL_PARAM_INVALID);
64   NNACL_CHECK_TRUE_RET(axis_size > 0, NNACL_ERR);
65 
66   for (int j = tid; j < outer_size; j += thread_num) {
67     const float *src_tmp = src_data + j * axis_size;
68 
69     float tmp = src_tmp[0];
70     int i = 1;
71 
72     SIMD_RUN_NO_SCALAR(ReduceSumByLastAxis, i, src_tmp, &tmp, axis_size);
73     for (; i < axis_size; i++) {
74       tmp += src_tmp[i];
75     }
76     dst_data[j] = tmp;
77   }
78   return NNACL_OK;
79 }
80 
// ReduceMean
// Sums along the axis, then divides once by axis_size at store time
// (axis_size > 0 is guaranteed by RegReduceOp's parameter check).
#define ReduceMeanPreDeal float tmp = 0;
#define ReduceMeanMidCalc tmp += inner_src[i * inner_size];
#define ReduceMeanPostDeal outer_dst[k] = tmp / axis_size;
RegReduceOp(ReduceMean, float);
86 
// ReduceMin
// Start from FLT_MAX so any element can only lower the accumulator.
#define ReduceMinPreDeal float tmp = FLT_MAX;
#define ReduceMinMidCalc tmp = fminf(tmp, inner_src[i * inner_size]);
#define ReduceMinPostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceMin, float);
92 
93 // ReduceMax
94 #define ReduceMaxPreDeal float tmp = FLT_MIN;
95 #define ReduceMaxMidCalc tmp = fmaxf(tmp, inner_src[i * inner_size]);
96 #define ReduceMaxPostDeal outer_dst[k] = tmp;
97 RegReduceOp(ReduceMax, float);
98 
ReduceMaxByLastAxis(int outer_size,int inner_size,int axis_size,const float * src_data,float * dst_data,int tid,int thread_num)99 int ReduceMaxByLastAxis(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid,
100                         int thread_num) {
101   NNACL_CHECK_TRUE_RET(src_data != NULL && dst_data != NULL, NNACL_NULL_PTR);
102   NNACL_CHECK_TRUE_RET(thread_num > 0, NNACL_PARAM_INVALID);
103   NNACL_CHECK_TRUE_RET(axis_size > 0, NNACL_ERR);
104 
105   for (int j = tid; j < outer_size; j += thread_num) {
106     const float *src_tmp = src_data + j * axis_size;
107 
108     float tmp = src_tmp[0];
109     int i = 1;
110 
111     SIMD_RUN_NO_SCALAR(ReduceMaxByLastAxis, i, src_tmp, &tmp, axis_size);
112     for (; i < axis_size; i++) {
113       tmp = fmaxf(tmp, src_tmp[i]);
114     }
115     dst_data[j] = tmp;
116   }
117   return NNACL_OK;
118 }
119 
// ReduceProd
// Multiplicative identity 1.0f; multiplies every element along the axis.
#define ReduceProdPreDeal float tmp = 1.0f;
#define ReduceProdMidCalc tmp *= inner_src[i * inner_size];
#define ReduceProdPostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceProd, float);
125 
// ReduceSumSquare
// Accumulates x*x for each element along the axis (sum of squares).
#define ReduceSumSquarePreDeal float tmp = 0;
#define ReduceSumSquareMidCalc tmp += (inner_src[i * inner_size] * inner_src[i * inner_size]);
#define ReduceSumSquarePostDeal outer_dst[k] = tmp;
RegReduceOp(ReduceSumSquare, float);
131 
132 // ReduceL2Norm
133 #define ReduceL2NormPreDeal float tmp = 0;
134 #define ReduceL2NormMidCalc tmp += (inner_src[i * inner_size] * inner_src[i * inner_size]);
135 #define ReduceL2NormPostDeal outer_dst[k] = sqrt(tmp);
136 RegReduceOp(ReduceL2Norm, float);
137 
// IntReduceSum
// Integer sum along the axis.  Note: unlike IntReduceProd below, there is no
// overflow check here — large sums wrap per the int arithmetic of the target.
#define IntReduceSumPreDeal int tmp = 0;
#define IntReduceSumMidCalc tmp += inner_src[i * inner_size];
#define IntReduceSumPostDeal outer_dst[k] = tmp;
RegReduceOp(IntReduceSum, int32_t);
143 
// IntReduceMean
// Integer mean: sums, then divides by axis_size with truncating integer
// division (axis_size > 0 is guaranteed by RegReduceOp's check).
#define IntReduceMeanPreDeal int tmp = 0;
#define IntReduceMeanMidCalc tmp += inner_src[i * inner_size];
#define IntReduceMeanPostDeal outer_dst[k] = tmp / axis_size;
RegReduceOp(IntReduceMean, int32_t);
149 
// IntReduceMin
// Start from INT32_MAX so any element can only lower the accumulator.
#define IntReduceMinPreDeal int tmp = INT32_MAX;
#define IntReduceMinMidCalc tmp = MSMIN(tmp, inner_src[i * inner_size]);
#define IntReduceMinPostDeal outer_dst[k] = tmp;
RegReduceOp(IntReduceMin, int32_t);
155 
// IntReduceMax
// Start from INT32_MIN (the true minimum for int32) so any element can only
// raise the accumulator.
#define IntReduceMaxPreDeal int tmp = INT32_MIN;
#define IntReduceMaxMidCalc tmp = MSMAX(tmp, inner_src[i * inner_size]);
#define IntReduceMaxPostDeal outer_dst[k] = tmp;
RegReduceOp(IntReduceMax, int32_t);
161 
ReduceAll(int outer_size,int inner_size,int axis_size,const bool * src_data,bool * dst_data,int tid,int thread_num)162 int ReduceAll(int outer_size, int inner_size, int axis_size, const bool *src_data, bool *dst_data, int tid,
163               int thread_num) {
164   if (src_data == NULL || dst_data == NULL) {
165     return NNACL_NULL_PTR;
166   }
167   if (thread_num == 0) {
168     return NNACL_PARAM_INVALID;
169   }
170   int i, j, k;
171   for (j = tid; j < outer_size; j += thread_num) {
172     const bool *outer_src = src_data + j * axis_size * inner_size;
173     bool *outer_dst = dst_data + j * inner_size;
174     for (k = 0; k < inner_size; k++) {
175       const bool *inner_src = outer_src + k;
176       bool *inner_dst = outer_dst + k;
177       bool tmp = true;
178       for (i = 0; i < axis_size; i++) {
179         tmp = tmp && inner_src[i * inner_size];
180       }
181       *inner_dst = tmp;
182     }
183   }
184   return NNACL_OK;
185 }
186 
IntReduceProd(int outer_size,int inner_size,int axis_size,const int32_t * src_data,int32_t * dst_data,int tid,int thread_num)187 int IntReduceProd(int outer_size, int inner_size, int axis_size, const int32_t *src_data, int32_t *dst_data, int tid,
188                   int thread_num) {
189   if (src_data == NULL || dst_data == NULL) {
190     return NNACL_NULL_PTR;
191   }
192   if (thread_num == 0) {
193     return NNACL_PARAM_INVALID;
194   }
195   int i, j, k;
196   for (j = tid; j < outer_size; j += thread_num) {
197     const int32_t *outer_src = src_data + j * axis_size * inner_size;
198     int32_t *outer_dst = dst_data + j * inner_size;
199     for (k = 0; k < inner_size; k++) {
200       const int32_t *inner_src = outer_src + k;
201       int32_t *inner_dst = outer_dst + k;
202       int tmp = 1;
203       for (i = 0; i < axis_size; i++) {
204         if (isMulOverflow(tmp, inner_src[i * inner_size])) {
205           return NNACL_ERRCODE_MUL_OVERFLOW;
206         }
207         tmp *= inner_src[i * inner_size];
208       }
209       *inner_dst = tmp;
210     }
211   }
212   return NNACL_OK;
213 }
214 
215 #ifdef ENABLE_NNACL_INFER_SHAPE
// Infers the output shape of a reduce op and canonicalizes the axes stored in
// the ReduceParameter (negative axes are wrapped to [0, rank); reduce_to_end_
// expands a single begin axis into every axis up to rank-1; an empty axis
// list means "reduce all").  Writes the output shape into out_shape and
// mutates reduce_parameter->axes_/num_axes_ in place.  Format and datatype
// are propagated unchanged from input[0].
int ReduceInferShape(int32_t **in_shape, size_t *dim_size, int32_t *out_shape, int32_t *in_format, int32_t *out_format,
                     int32_t *in_datatype, int32_t *out_datatype, OpParameter *param) {
  *out_format = in_format[0];
  *out_datatype = in_datatype[0];
  ReduceParameter *reduce_parameter = (ReduceParameter *)param;
  bool keep_dims = reduce_parameter->keep_dims_;
  int num_axes = reduce_parameter->num_axes_;
  int32_t *in_shape0 = in_shape[0];
  int rank = dim_size[0];
  NNACL_CHECK_TRUE_RET(rank > 0 && rank <= REDUCE_MAX_AXES_NUM, NNACL_PARAM_INVALID);
  int axes[REDUCE_MAX_AXES_NUM];
  int actual_axes_num = num_axes;
  // Normalize every requested axis into [0, rank); reject out-of-range axes.
  for (int i = 0; i < num_axes; ++i) {
    NNACL_CHECK_TRUE_RET(reduce_parameter->axes_[i] >= -rank && reduce_parameter->axes_[i] < rank, NNACL_PARAM_INVALID);
    if (reduce_parameter->axes_[i] < 0) {
      axes[i] = reduce_parameter->axes_[i] + rank;
    } else {
      axes[i] = reduce_parameter->axes_[i];
    }
  }
  if (reduce_parameter->reduce_to_end_) {
    // reduce_to_end_ takes exactly one begin axis and appends every axis
    // after it, so [begin, rank) is reduced.
    NNACL_CHECK_TRUE_RET(num_axes == 1, NNACL_PARAM_INVALID);
    int begin_axis = axes[0];
    num_axes = rank - begin_axis;
    for (int i = begin_axis + 1; i < rank; ++i) {
      axes[actual_axes_num++] = i;
    }
  }
  if (num_axes == 0) {
    // Empty axis list: reduce over ALL dimensions.  Output is all-ones when
    // keep_dims, otherwise a scalar (nothing written to out_shape).
    int j = 0;
    for (int i = 0; i < rank; ++i) {
      axes[i] = i;
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    }
    reduce_parameter->num_axes_ = rank;
    for (int i = 0; i < rank; ++i) {
      reduce_parameter->axes_[i] = axes[i];
    }
    return NNACL_OK;
  }
  // reduce on selected axes
  int j = 0;
  for (int i = 0; i < rank; ++i) {
    // Reduced dimensions become 1 (keep_dims) or are dropped; the rest are
    // copied through from the input shape.
    bool reduce_axis = false;
    for (int idx = 0; idx < num_axes; ++idx) {
      if (axes[idx] == i) {
        reduce_axis = true;
        break;
      }
    }
    if (reduce_axis) {
      if (keep_dims) {
        out_shape[j++] = 1;
      }
    } else {
      out_shape[j++] = in_shape0[i];
    }
  }
  // Persist the canonicalized axes back into the parameter for the kernel.
  reduce_parameter->num_axes_ = num_axes;
  for (int i = 0; i < num_axes; ++i) {
    reduce_parameter->axes_[i] = axes[i];
  }
  return NNACL_OK;
}
282 #endif
283 
284 // [A, B] -> [B]
285 // col_size : start -> end for parallel
ReduceSumDim2Axis0(size_t col_size,size_t col_len,size_t row_len,const float * src_data,float * dst_data)286 int ReduceSumDim2Axis0(size_t col_size, size_t col_len, size_t row_len, const float *src_data, float *dst_data) {
287   if (src_data == NULL || dst_data == NULL) {
288     return NNACL_NULL_PTR;
289   }
290 
291   size_t k = 0;
292   SIMD_RUN_NO_SCALAR(ReduceSumDim2Axis0, k, col_size, col_len, row_len, src_data, dst_data);
293   for (; k < col_size; k++) {
294     const float *inner_src = src_data + k;
295     float *inner_dst = dst_data + k;
296     float tmp = 0.0f;
297     for (size_t i = 0; i < row_len; i++) {
298       tmp += inner_src[i * col_len];
299     }
300     *inner_dst = tmp;
301   }
302   return NNACL_OK;
303 }
304 
305 // [A, B] -> [A]
ReduceSumDim2Axis1(size_t col_len,const float * src_data,float * dst_data)306 int ReduceSumDim2Axis1(size_t col_len, const float *src_data, float *dst_data) {
307   if (src_data == NULL || dst_data == NULL) {
308     return NNACL_NULL_PTR;
309   }
310   size_t k = 0;
311   float tmp = 0;
312 #ifdef ENABLE_AVX
313   size_t block_mod = col_len % C8NUM;
314   size_t block_c8 = col_len - block_mod;
315   float tmp_arr[8] = {0, 0, 0, 0, 0, 0, 0, 0};
316   MS_FLOAT32X8 tmp_arr_8 = MS_MOV256_F32(tmp_arr[0]);
317   for (; k < block_c8; k += C8NUM) {
318     MS_FLOAT32X8 src_in = MS_LD256_F32(src_data + k);
319     tmp_arr_8 = MS_ADD256_F32(tmp_arr_8, src_in);
320   }
321   MS_ST256_F32(tmp_arr, tmp_arr_8);
322   for (size_t i = 0; i < 8; ++i) {
323     tmp += tmp_arr[i];
324   }
325 #endif
326   for (; k < col_len; k++) {
327     tmp += src_data[k];
328   }
329   dst_data[0] = tmp;
330   return NNACL_OK;
331 }
332 
ReduceMeanWithAxis(const float * src_data,float * mean,int64_t size)333 int ReduceMeanWithAxis(const float *src_data, float *mean, int64_t size) {
334   if (size == 0 || src_data == NULL) {
335     return NNACL_NULL_PTR;
336   }
337   float sum = 0.0;
338   int64_t i = 0;
339   SIMD_RUN_NO_SCALAR(ReduceSumByLastAxis, i, src_data, &sum, 0);
340   for (; i < size; ++i) {
341     sum += src_data[i];
342   }
343   *mean = sum / size;
344   return NNACL_OK;
345 }
346 
ReduceDeviation(const float * src_data,int64_t size,float mean,float * deviation)347 int ReduceDeviation(const float *src_data, int64_t size, float mean, float *deviation) {
348   if (size == 0 || src_data == NULL) {
349     return NNACL_NULL_PTR;
350   }
351   int64_t i = 0;
352   SIMD_RUN_NO_SCALAR(FloatReduceDeviation, i, src_data, mean, size, deviation);
353   for (; i < size; ++i) {
354     float tmp = src_data[i] - mean;
355     tmp = tmp * tmp;
356     *deviation += tmp;
357   }
358   return NNACL_OK;
359 }
360