• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "nnacl/kernel/reduce.h"
18 #include <math.h>
19 #include "nnacl/fp32/reduce_fp32.h"
20 #include "nnacl/kernel/reshape.h"
21 #include "nnacl/nnacl_common.h"
22 #include "nnacl/tensor_c_utils.h"
23 #include "nnacl/kernel/default_kernel_base.h"
24 
// Bind the compute-function set matching the reduce mode configured in the
// kernel's parameter. Unknown modes leave compute_ untouched.
void InitialReduceKernelList(KernelBase *base) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  ReduceParameter *param = (ReduceParameter *)(base->param_);

  ReduceKernelList candidates[] = {
    {Reduce_Sum, ReduceSum, IntReduceSum, NULL, ReduceSumByLastAxis},
    {Reduce_Mean, ReduceMean, IntReduceMean, NULL, NULL},
    {Reduce_Max, ReduceMax, IntReduceMax, NULL, ReduceMaxByLastAxis},
    {Reduce_Min, ReduceMin, IntReduceMin, NULL, NULL},
    {Reduce_Prod, ReduceProd, IntReduceProd, NULL, NULL},
    {Reduce_SumSquare, ReduceSum, IntReduceSum, NULL, NULL},
    {Reduce_ASum, ReduceSum, IntReduceSum, NULL, NULL},
    {Reduce_All, NULL, NULL, ReduceAll, NULL},
    {Reduce_L2, ReduceL2Norm, NULL, NULL, NULL},
  };

  size_t candidate_num = sizeof(candidates) / sizeof(candidates[0]);
  for (size_t i = 0; i < candidate_num; ++i) {
    if (candidates[i].type_ == param->mode_) {
      reduce->compute_ = candidates[i];
      break;
    }
  }
}
47 
// Run one parallel slice of the current reduction step, dispatching on the
// tensor data type. Returns NNACL_REDUCE_UNSUPPORTED_DATA_TYPE for types
// without a bound compute function.
int CallReduceUnit(KernelBase *base, int task_id) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  NNACL_CHECK_NULL_RETURN_ERR(reduce->src_data_);
  NNACL_CHECK_NULL_RETURN_ERR(reduce->dst_data_);

  if (reduce->data_type_ == kNumberTypeFloat32) {
    float *src = (float *)(reduce->src_data_);
    float *dst = (float *)(reduce->dst_data_);
    // Fast path: when the reduced axis is innermost (inner_size_ == 1) a
    // contiguous last-axis kernel may be available.
    if (reduce->inner_size_ == 1 && reduce->compute_.float_last_axis_func_ != NULL) {
      return reduce->compute_.float_last_axis_func_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_, src,
                                                    dst, task_id, reduce->base_.thread_nr_);
    }
    NNACL_CHECK_NULL_RETURN_ERR(reduce->compute_.float_function_);
    return reduce->compute_.float_function_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_, src, dst,
                                            task_id, reduce->base_.thread_nr_);
  }

  if (reduce->data_type_ == kNumberTypeBool) {
    NNACL_CHECK_NULL_RETURN_ERR(reduce->compute_.bool_function_);
    return reduce->compute_.bool_function_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_,
                                           (bool *)(reduce->src_data_), (bool *)(reduce->dst_data_), task_id,
                                           reduce->base_.thread_nr_);
  }

  if (reduce->data_type_ == kNumberTypeInt32) {
    NNACL_CHECK_NULL_RETURN_ERR(reduce->compute_.int_function_);
    return reduce->compute_.int_function_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_,
                                          (int *)(reduce->src_data_), (int *)(reduce->dst_data_), task_id,
                                          reduce->base_.thread_nr_);
  }

  return NNACL_REDUCE_UNSUPPORTED_DATA_TYPE;
}
82 
ReduceImpl(void * cdata,int task_id,float l,float r)83 int ReduceImpl(void *cdata, int task_id, float l, float r) {
84   NNACL_CHECK_NULL_RETURN_ERR(cdata);
85   ReduceStruct *reduce = (ReduceStruct *)cdata;
86   return reduce->call_uint_((KernelBase *)reduce, task_id);
87 }
88 
// When the reduction degenerates to a copy (every reduced axis already has
// length 1), move the input into the output via a parallel reshape.
// Returns NNACL_OK on success or an NNACL error code.
int CopyReduceyInputToOutput(ReduceStruct *reduce) {
  int total_num = GetElementNum(reduce->base_.in_[FIRST_INPUT]);
  NNACL_CHECK_FALSE(total_num == 0, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);
  // Guard the UP_DIV below: a zero thread count would divide by zero.
  NNACL_CHECK_FALSE(reduce->base_.thread_nr_ == 0, NNACL_ERR);
  int block_num = UP_DIV(total_num, reduce->base_.thread_nr_);
  int tmp_thread_num = UP_DIV(total_num, block_num);
  NNACL_CHECK_FALSE(tmp_thread_num == 0, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);

  // Borrow the reshape kernel's parallel copy: only the fields read by
  // ParallelReshape need to be filled in.
  ReshapeStruct reshape_struct;
  reshape_struct.base_.in_ = reduce->base_.in_;
  reshape_struct.base_.out_ = reduce->base_.out_;
  reshape_struct.block_num_ = block_num;
  reshape_struct.total_num_ = total_num;
  reshape_struct.base_.thread_nr_ = tmp_thread_num;
  return reduce->base_.env_->ParallelLaunch(reduce->base_.env_->thread_pool_, ParallelReshape, &reshape_struct,
                                            tmp_thread_num);
}
105 
// Allocate one scratch buffer per intermediate reduction step.
// Slots are zeroed first so FreeReduceTmpBuffer can safely release a
// partially completed allocation after a mid-loop failure.
int MallocReduceTmpBuffer(ReduceStruct *reduce) {
  memset(reduce->data_buffers_, 0, reduce->data_buffers_size_ * sizeof(void *));

  for (int i = 0; i < reduce->data_buffers_size_; ++i) {
    size_t bytes = reduce->data_buffer_sizes_[i] * DataTypeCSize(reduce->data_type_);
    reduce->data_buffers_[i] = reduce->base_.env_->Alloc(reduce->base_.env_->allocator_, bytes);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(reduce->data_buffers_[i]);
  }
  return NNACL_OK;
}
117 
// Release every scratch buffer allocated by MallocReduceTmpBuffer and reset
// each slot to NULL so a repeated free pass is a no-op.
void FreeReduceTmpBuffer(ReduceStruct *reduce) {
  for (int i = 0; i < reduce->data_buffers_size_; ++i) {
    void *buffer = reduce->data_buffers_[i];
    if (buffer != NULL) {
      reduce->base_.env_->Free(reduce->base_.env_->allocator_, buffer);
    }
    reduce->data_buffers_[i] = NULL;
  }
}
126 
// Scale every element of the output tensor by the parameter's coeff.
// Only valid for float32 kernels.
int CalculateReduceCoeffOutput(KernelBase *base) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  if (reduce->data_type_ != kNumberTypeFloat32) {
    return NNACL_REDUCE_COEFF_DATA_TYPE_INVALID;
  }

  TensorC *output = reduce->base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output);
  NNACL_CHECK_NULL_RETURN_ERR(output->data_);

  float coeff = ((ReduceParameter *)reduce->base_.param_)->coeff;
  float *out_data = (float *)output->data_;
  int element_num = GetElementNum(output);
  for (int i = 0; i < element_num; ++i) {
    out_data[i] *= coeff;
  }
  return NNACL_OK;
}
144 
// In-place pre-transform of the float input so a plain sum kernel can finish
// the job: ASum takes absolute values, SumSquare squares each element.
// Integer and bool inputs are left untouched.
void HandleReduceASumAndSumSquare(KernelBase *base) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  if (reduce->data_type_ == kNumberTypeInt32 || reduce->data_type_ == kNumberTypeBool) {
    return;
  }

  TensorC *input = base->in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_VOID(input);
  float *data = (float *)input->data_;
  NNACL_CHECK_NULL_RETURN_VOID(data);

  int element_num = GetElementNum(input);
  int mode = ((ReduceParameter *)base->param_)->mode_;

  if (mode == Reduce_ASum) {
    for (int i = 0; i < element_num; ++i) {
      if (data[i] < 0.0f) {
        data[i] = 0.0f - data[i];
      }
    }
  } else if (mode == Reduce_SumSquare) {
    for (int i = 0; i < element_num; ++i) {
      data[i] = data[i] * data[i];
    }
  }
}
173 
// Validate tensor counts, reject NULL tensors, and (when present) require
// the axes tensor to hold an integer type.
int ReduceCheckInputsOutputs(ReduceStruct *reduce) {
  NNACL_CHECK_FALSE(reduce->base_.in_size_ < ONE_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(reduce->base_.out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  for (size_t i = 0; i < reduce->base_.in_size_; ++i) {
    NNACL_CHECK_NULL_RETURN_ERR(reduce->base_.in_[i]);
  }
  for (size_t i = 0; i < reduce->base_.out_size_; ++i) {
    NNACL_CHECK_NULL_RETURN_ERR(reduce->base_.out_[i]);
  }

  NNACL_CHECK_NULL_RETURN_ERR(reduce->base_.in_[FIRST_INPUT]);
  if (reduce->base_.in_size_ > ONE_TENSOR) {
    TensorC *axes_tensor = reduce->base_.in_[SECOND_INPUT];
    NNACL_CHECK_NULL_RETURN_ERR(axes_tensor);
    bool axes_type_ok = axes_tensor->data_type_ == kNumberTypeInt || axes_tensor->data_type_ == kNumberTypeInt32 ||
                        axes_tensor->data_type_ == kNumberTypeInt64;
    NNACL_CHECK_FALSE(!axes_type_ok, NNACL_REDUCE_AXES_TENSOR_ERROR);
  }
  return NNACL_OK;
}
195 
// Resolve the reduce axes into reduce->axes_:
//   - no axes tensor                -> num_axes_ = 0 (resolved later in Resize)
//   - axes tensor without data      -> reduce over every input dimension
//   - int32 axes data               -> raw copy
//   - int64 axes data               -> per-element narrowing to int32
int ReduceCommonPrepare(ReduceStruct *reduce) {
  int ret = ReduceCheckInputsOutputs(reduce);
  if (ret != NNACL_OK) {
    return ret;
  }

  if (reduce->base_.in_size_ == ONE_TENSOR) {
    reduce->num_axes_ = 0;
    return NNACL_OK;
  }

  TensorC *axes_tensor = reduce->base_.in_[SECOND_INPUT];
  reduce->num_axes_ = GetElementNum(axes_tensor);

  if (axes_tensor->data_ == NULL) {
    // Axes shape known but no data: default to reducing all dimensions.
    reduce->num_axes_ = reduce->base_.in_[FIRST_INPUT]->shape_size_;
    for (int i = 0; i < reduce->num_axes_; ++i) {
      reduce->axes_[i] = i;
    }
    return NNACL_OK;
  }

  if (reduce->num_axes_ <= 0 || reduce->num_axes_ > MAX_SHAPE_SIZE) {
    return NNACL_REDUCE_AXES_TENSOR_ERROR;
  }

  if (axes_tensor->data_type_ == kNumberTypeInt32 || axes_tensor->data_type_ == kNumberTypeInt) {
    NNACL_CHECK_FALSE(GetSize(axes_tensor) == 0, NNACL_REDUCE_AXES_TENSOR_ERROR);
    (void)memcpy(reduce->axes_, axes_tensor->data_, GetSize(axes_tensor));
  } else {
    // int64 axes: narrow each value to int32.
    int64_t *axes_data = (int64_t *)axes_tensor->data_;
    for (int i = 0; i < reduce->num_axes_; ++i) {
      reduce->axes_[i] = (int32_t)axes_data[i];
    }
  }
  return NNACL_OK;
}
231 
// Normalize the axis list against the input rank: resolve negative axes,
// expand reduce_to_end_ into an explicit tail of axes, and treat an empty
// list as "reduce every dimension".
int CheckReduceParameters(ReduceStruct *reduce) {
  int rank = reduce->base_.in_[FIRST_INPUT]->shape_size_;
  NNACL_CHECK_FALSE(reduce->num_axes_ > rank, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);

  for (int i = 0; i < reduce->num_axes_; ++i) {
    NNACL_CHECK_FALSE(reduce->axes_[i] < -rank, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);
    NNACL_CHECK_FALSE(reduce->axes_[i] >= rank, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);
    if (reduce->axes_[i] < 0) {
      reduce->axes_[i] += rank;
    }
  }

  if (((ReduceParameter *)reduce->base_.param_)->reduce_to_end_) {
    // Reduce every axis from axes_[0] through the last dimension.
    reduce->num_axes_ = rank - reduce->axes_[0];
    for (int i = 1; i < reduce->num_axes_; ++i) {
      reduce->axes_[i] = reduce->axes_[0] + i;
    }
  }

  if (reduce->num_axes_ == 0) {
    // Empty axis list: reduce over all dimensions.
    for (int i = 0; i < rank; ++i) {
      reduce->axes_[i] = i;
    }
    reduce->num_axes_ = rank;
  }
  return NNACL_OK;
}
261 
// Precompute, for each reduction step, the outer size (product of dims before
// the axis), the inner size (product of dims after it), and the axis length.
// After each step the axis collapses to 1 in the working shape, so later
// steps see the already-reduced geometry.
void ReduceCalculateInnerOuterSize(ReduceStruct *reduce) {
  TensorC *input_tensor = reduce->base_.in_[FIRST_INPUT];
  int working_shape[MAX_SHAPE_SIZE];
  memcpy(working_shape, input_tensor->shape_, MAX_SHAPE_SIZE * sizeof(int));
  reduce->offset_size_ = 0;

  for (int i = 0; i < reduce->num_axes_; ++i) {
    int axis = reduce->axes_[i];

    int outer = 1;
    for (int j = 0; j < axis; ++j) {
      outer *= working_shape[j];
    }
    int inner = 1;
    for (int j = axis + 1; j < (int)input_tensor->shape_size_; ++j) {
      inner *= working_shape[j];
    }

    reduce->outer_sizes_[reduce->offset_size_] = outer;
    reduce->inner_sizes_[reduce->offset_size_] = inner;
    reduce->axis_sizes_[reduce->offset_size_] = working_shape[axis];
    reduce->offset_size_++;
    working_shape[axis] = 1;  // axis is reduced away for subsequent steps
  }
}
287 
// Size the scratch buffers: every reduction step except the last writes an
// intermediate tensor whose volume is the working shape with the current
// axis collapsed to 1.
void ReduceCalculateTmpBufferSize(ReduceStruct *reduce) {
  reduce->data_buffers_size_ = 0;

  TensorC *input_tensor = reduce->base_.in_[FIRST_INPUT];
  int working_shape[MAX_SHAPE_SIZE];
  memcpy(working_shape, input_tensor->shape_, MAX_SHAPE_SIZE * sizeof(int));

  for (int i = 0; i < reduce->num_axes_ - 1; ++i) {
    int axis = reduce->axes_[i];
    size_t buffer_size = 1;
    for (size_t j = 0; j < input_tensor->shape_size_; ++j) {
      if ((int)j != axis) {
        buffer_size *= (size_t)working_shape[j];
      }
    }
    reduce->data_buffer_sizes_[reduce->data_buffers_size_++] = buffer_size;
    working_shape[axis] = 1;
  }
}
307 
// A reduce degenerates to a copy when every reduced axis already has length 1
// and the mode does not transform element values (SumSquare/ASum/All/L2 do).
void ReduceDecideIfOnlyCopy(ReduceStruct *reduce) {
  ReduceModeC transforming_modes[] = {Reduce_SumSquare, Reduce_ASum, Reduce_All, Reduce_L2};
  size_t mode_count = sizeof(transforming_modes) / sizeof(transforming_modes[0]);
  for (size_t i = 0; i < mode_count; ++i) {
    if (transforming_modes[i] == ((ReduceParameter *)reduce->base_.param_)->mode_) {
      reduce->only_copy_ = false;
      return;
    }
  }

  int *in_shape = reduce->base_.in_[FIRST_INPUT]->shape_;
  for (int i = 0; i < reduce->num_axes_; ++i) {
    if (in_shape[reduce->axes_[i]] != 1) {
      reduce->only_copy_ = false;
      return;
    }
  }
  reduce->only_copy_ = true;
}
329 
ReducePrepare(struct KernelBase * self)330 int ReducePrepare(struct KernelBase *self) {
331   NNACL_CHECK_NULL_RETURN_ERR(self);
332   ReduceStruct *reduce = (ReduceStruct *)self;
333 
334   NNACL_CHECK_FALSE(self->in_size_ < ONE_TENSOR, ONE_TENSOR);
335   NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, ONE_TENSOR);
336 
337   int ret = ReduceCommonPrepare(reduce);
338   if (ret != NNACL_OK) {
339     return ret;
340   }
341 
342   reduce->init_kernel_list_(self);
343   return NNACL_OK;
344 }
345 
ReduceResize(struct KernelBase * self)346 int ReduceResize(struct KernelBase *self) {
347   NNACL_CHECK_NULL_RETURN_ERR(self);
348   ReduceStruct *reduce = (ReduceStruct *)self;
349 
350   int ret = CheckReduceParameters(reduce);
351   if (ret != NNACL_OK) {
352     return ret;
353   }
354 
355   ReduceDecideIfOnlyCopy(reduce);
356   ReduceCalculateTmpBufferSize(reduce);
357   ReduceCalculateInnerOuterSize(reduce);
358 
359   if (reduce->num_axes_ == 1) {
360     self->thread_nr_ = self->UpdateThread(
361       TC_TYPE(PrimType_ReduceFusion, ((ReduceParameter *)reduce->base_.param_)->mode_),
362       reduce->inner_sizes_[Index0] * reduce->axis_sizes_[Index0],
363       reduce->inner_sizes_[Index0] * reduce->axis_sizes_[Index0], reduce->outer_sizes_[Index0], self->thread_nr_);
364   } else {
365     self->thread_nr_ = self->UpdateThread(TC_TYPE(PrimType_ReduceFusion, Reduce_Max + 1), 0, 0,
366                                           GetElementNum(self->out_[OUTPUT_INDEX]), self->thread_nr_);
367   }
368   return NNACL_OK;
369 }
370 
/* Execute the reduction. Each requested axis is reduced in its own pass:
 * intermediate passes write into scratch buffers, the final pass writes into
 * the output tensor. With reduce_to_end_ set and a non-negligible coeff the
 * output is scaled afterwards by calculate_coeff_. */
int ReduceCompute(struct KernelBase *self) {
  NNACL_CHECK_NULL_RETURN_ERR(self);
  ReduceStruct *reduce = (ReduceStruct *)self;
  NNACL_CHECK_FALSE(self->in_[FIRST_INPUT]->data_type_ != reduce->data_type_, NNACL_ERR);

  // Every reduced axis has length 1 and the mode is copy-safe: just copy.
  if (reduce->only_copy_) {
    return CopyReduceyInputToOutput(reduce);
  }

  int ret = MallocReduceTmpBuffer(reduce);
  if (ret != NNACL_OK) {
    FreeReduceTmpBuffer(reduce);
    return ret;
  }

  reduce->src_data_ = self->in_[FIRST_INPUT]->data_;
  // In-place pre-transform of the (float) input for ASum/SumSquare modes.
  reduce->handle_sum_square_(self);
  for (int i = 0; i < reduce->num_axes_; i++) {
    if (i != (reduce->num_axes_ - 1)) {
      // Intermediate pass: write into the scratch buffer for this step.
      reduce->dst_data_ = reduce->data_buffers_[i];
    } else {
      // Final pass: write straight into the output tensor.
      reduce->dst_data_ = self->out_[FIRST_INPUT]->data_;
    }
    reduce->outer_size_ = reduce->outer_sizes_[i];
    reduce->inner_size_ = reduce->inner_sizes_[i];
    reduce->axis_size_ = reduce->axis_sizes_[i];
    NNACL_CHECK_FALSE(reduce->axis_size_ == 0, NNACL_REDUCE_AXIS_SIZE_ERROR);

    ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ReduceImpl, self, self->thread_nr_);
    if (ret != NNACL_OK) {
      FreeReduceTmpBuffer(reduce);
      return ret;
    }
    // This pass's destination becomes the next pass's source.
    reduce->src_data_ = reduce->dst_data_;
  }

  ReduceParameter *param = (ReduceParameter *)reduce->base_.param_;
  if (param->reduce_to_end_ && fabsf(param->coeff) > 1e-5) {
    ret = reduce->calculate_coeff_(self);
  }

  FreeReduceTmpBuffer(reduce);
  return ret;
}
415 
CreateReduce(OpParameter * param,int data_type)416 KernelBase *CreateReduce(OpParameter *param, int data_type) {
417   ReduceStruct *reduce = (ReduceStruct *)malloc(sizeof(ReduceStruct));
418   NNACL_MALLOC_CHECK_NULL_RETURN_NULL(reduce);
419   memset(reduce, 0, sizeof(ReduceStruct));
420   reduce->data_type_ = data_type;
421   reduce->base_.Release = DefaultRelease;
422   reduce->base_.Prepare = ReducePrepare;
423   reduce->base_.Resize = ReduceResize;
424   reduce->base_.Compute = ReduceCompute;
425   reduce->handle_sum_square_ = HandleReduceASumAndSumSquare;
426   reduce->calculate_coeff_ = CalculateReduceCoeffOutput;
427   reduce->init_kernel_list_ = InitialReduceKernelList;
428   reduce->call_uint_ = CallReduceUnit;
429   return (KernelBase *)reduce;
430 }
431 
432 REG_KERNEL_CREATOR(PrimType_ReduceFusion, kNumberTypeBool, CreateReduce)
433 REG_KERNEL_CREATOR(PrimType_ReduceFusion, kNumberTypeInt32, CreateReduce)
434 REG_KERNEL_CREATOR(PrimType_ReduceFusion, kNumberTypeFloat32, CreateReduce)
435