1 /**
2 * Copyright 2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "nnacl/kernel/reduce.h"
18 #include <math.h>
19 #include "nnacl/fp32/reduce_fp32.h"
20 #include "nnacl/kernel/reshape.h"
21 #include "nnacl/nnacl_common.h"
22 #include "nnacl/tensor_c_utils.h"
23 #include "nnacl/kernel/default_kernel_base.h"
24
/* Bind the compute-function set matching the parameter's reduce mode.
 * If no entry matches, compute_ is left untouched. */
void InitialReduceKernelList(KernelBase *base) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  ReduceParameter *param = (ReduceParameter *)(base->param_);

  const ReduceKernelList candidates[] = {{Reduce_Sum, ReduceSum, IntReduceSum, NULL, ReduceSumByLastAxis},
                                         {Reduce_Mean, ReduceMean, IntReduceMean, NULL, NULL},
                                         {Reduce_Max, ReduceMax, IntReduceMax, NULL, ReduceMaxByLastAxis},
                                         {Reduce_Min, ReduceMin, IntReduceMin, NULL, NULL},
                                         {Reduce_Prod, ReduceProd, IntReduceProd, NULL, NULL},
                                         {Reduce_SumSquare, ReduceSum, IntReduceSum, NULL, NULL},
                                         {Reduce_ASum, ReduceSum, IntReduceSum, NULL, NULL},
                                         {Reduce_All, NULL, NULL, ReduceAll, NULL},
                                         {Reduce_L2, ReduceL2Norm, NULL, NULL, NULL}};

  const size_t candidate_count = sizeof(candidates) / sizeof(candidates[0]);
  size_t idx = 0;
  while (idx < candidate_count) {
    if (candidates[idx].type_ == param->mode_) {
      reduce->compute_ = candidates[idx];
      break;
    }
    ++idx;
  }
}
47
/* Run one parallel slice (task_id) of the current reduction pass, dispatching
 * to the compute function matching the kernel's data type. */
int CallReduceUnit(KernelBase *base, int task_id) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  NNACL_CHECK_NULL_RETURN_ERR(reduce->src_data_);
  NNACL_CHECK_NULL_RETURN_ERR(reduce->dst_data_);

  switch (reduce->data_type_) {
    case kNumberTypeFloat32: {
      if (reduce->inner_size_ == 1 && reduce->compute_.float_last_axis_func_ != NULL) {
        // Fast path: reducing the innermost axis with a specialized kernel.
        return reduce->compute_.float_last_axis_func_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_,
                                                      (float *)(reduce->src_data_), (float *)(reduce->dst_data_),
                                                      task_id, reduce->base_.thread_nr_);
      }
      NNACL_CHECK_NULL_RETURN_ERR(reduce->compute_.float_function_);
      return reduce->compute_.float_function_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_,
                                              (float *)(reduce->src_data_), (float *)(reduce->dst_data_), task_id,
                                              reduce->base_.thread_nr_);
    }
    case kNumberTypeBool: {
      NNACL_CHECK_NULL_RETURN_ERR(reduce->compute_.bool_function_);
      return reduce->compute_.bool_function_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_,
                                             (bool *)(reduce->src_data_), (bool *)(reduce->dst_data_), task_id,
                                             reduce->base_.thread_nr_);
    }
    case kNumberTypeInt32: {
      NNACL_CHECK_NULL_RETURN_ERR(reduce->compute_.int_function_);
      return reduce->compute_.int_function_(reduce->outer_size_, reduce->inner_size_, reduce->axis_size_,
                                            (int *)(reduce->src_data_), (int *)(reduce->dst_data_), task_id,
                                            reduce->base_.thread_nr_);
    }
    default:
      return NNACL_REDUCE_UNSUPPORTED_DATA_TYPE;
  }
}
82
ReduceImpl(void * cdata,int task_id,float l,float r)83 int ReduceImpl(void *cdata, int task_id, float l, float r) {
84 NNACL_CHECK_NULL_RETURN_ERR(cdata);
85 ReduceStruct *reduce = (ReduceStruct *)cdata;
86 return reduce->call_uint_((KernelBase *)reduce, task_id);
87 }
88
/* Degenerate reduce (all reduced dims are 1): copy input to output in
 * parallel by borrowing the reshape kernel's element-copy worker. */
int CopyReduceyInputToOutput(ReduceStruct *reduce) {
  int element_count = GetElementNum(reduce->base_.in_[FIRST_INPUT]);
  NNACL_CHECK_FALSE(element_count == 0, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);
  int per_block = UP_DIV(element_count, reduce->base_.thread_nr_);
  int launch_threads = UP_DIV(element_count, per_block);
  NNACL_CHECK_FALSE(launch_threads == 0, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);

  // Only the fields read by ParallelReshape are populated here.
  ReshapeStruct reshape;
  reshape.base_.in_ = reduce->base_.in_;
  reshape.base_.out_ = reduce->base_.out_;
  reshape.base_.thread_nr_ = launch_threads;
  reshape.block_num_ = per_block;
  reshape.total_num_ = element_count;
  return reduce->base_.env_->ParallelLaunch(reduce->base_.env_->thread_pool_, ParallelReshape, &reshape,
                                            launch_threads);
}
105
/* Allocate one scratch buffer per intermediate reduction pass. */
int MallocReduceTmpBuffer(ReduceStruct *reduce) {
  // Zero every slot up front so FreeReduceTmpBuffer can tell which
  // allocations succeeded if we bail out part-way through.
  memset(reduce->data_buffers_, 0, reduce->data_buffers_size_ * sizeof(void *));

  size_t type_size = DataTypeCSize(reduce->data_type_);
  for (int i = 0; i < reduce->data_buffers_size_; i++) {
    void *buffer = reduce->base_.env_->Alloc(reduce->base_.env_->allocator_, reduce->data_buffer_sizes_[i] * type_size);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(buffer);
    reduce->data_buffers_[i] = buffer;
  }
  return NNACL_OK;
}
117
/* Release all scratch buffers; NULL slots (never allocated) are skipped.
 * Safe to call multiple times — each slot is cleared after release. */
void FreeReduceTmpBuffer(ReduceStruct *reduce) {
  for (int i = 0; i < reduce->data_buffers_size_; i++) {
    void *buffer = reduce->data_buffers_[i];
    reduce->data_buffers_[i] = NULL;
    if (buffer != NULL) {
      reduce->base_.env_->Free(reduce->base_.env_->allocator_, buffer);
    }
  }
}
126
/* Scale every element of the output tensor by the parameter's coeff.
 * Only valid for float32 kernels. */
int CalculateReduceCoeffOutput(KernelBase *base) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  if (reduce->data_type_ != kNumberTypeFloat32) {
    return NNACL_REDUCE_COEFF_DATA_TYPE_INVALID;
  }

  TensorC *out_tensor = reduce->base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(out_tensor);
  NNACL_CHECK_NULL_RETURN_ERR(out_tensor->data_);

  // Hoist the loop-invariant scaling factor out of the loop.
  float coeff = ((ReduceParameter *)reduce->base_.param_)->coeff;
  float *out_data = (float *)out_tensor->data_;
  int num = GetElementNum(out_tensor);
  for (int i = 0; i < num; ++i) {
    out_data[i] *= coeff;
  }
  return NNACL_OK;
}
144
/* In-place pre-processing for ASum (|x|) and SumSquare (x*x) modes: both are
 * implemented as a plain Sum after transforming the float input. Int/bool
 * kernels never take these modes' transform. */
void HandleReduceASumAndSumSquare(KernelBase *base) {
  ReduceStruct *reduce = (ReduceStruct *)base;
  if (reduce->data_type_ == kNumberTypeInt32 || reduce->data_type_ == kNumberTypeBool) {
    return;
  }

  TensorC *in_tensor = base->in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_VOID(in_tensor);
  float *data = (float *)in_tensor->data_;
  NNACL_CHECK_NULL_RETURN_VOID(data);

  int num = GetElementNum(in_tensor);
  int mode = ((ReduceParameter *)base->param_)->mode_;

  if (mode == Reduce_ASum) {
    // Absolute value: flip the sign of negative entries in place.
    for (int i = 0; i < num; ++i) {
      data[i] = (data[i] < 0.0f) ? (0.0f - data[i]) : data[i];
    }
  } else if (mode == Reduce_SumSquare) {
    for (int i = 0; i < num; ++i) {
      data[i] *= data[i];
    }
  }
}
173
/* Validate tensor counts, non-NULL tensor pointers, and (when present) that
 * the axes tensor holds an integer type. */
int ReduceCheckInputsOutputs(ReduceStruct *reduce) {
  NNACL_CHECK_FALSE(reduce->base_.in_size_ < ONE_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(reduce->base_.out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  for (size_t idx = 0; idx < reduce->base_.in_size_; idx++) {
    NNACL_CHECK_NULL_RETURN_ERR(reduce->base_.in_[idx]);
  }
  for (size_t idx = 0; idx < reduce->base_.out_size_; idx++) {
    NNACL_CHECK_NULL_RETURN_ERR(reduce->base_.out_[idx]);
  }

  NNACL_CHECK_NULL_RETURN_ERR(reduce->base_.in_[FIRST_INPUT]);
  if (reduce->base_.in_size_ > ONE_TENSOR) {
    TensorC *axes_tensor = reduce->base_.in_[SECOND_INPUT];
    NNACL_CHECK_NULL_RETURN_ERR(axes_tensor);
    bool axes_type_ok = axes_tensor->data_type_ == kNumberTypeInt || axes_tensor->data_type_ == kNumberTypeInt32 ||
                        axes_tensor->data_type_ == kNumberTypeInt64;
    NNACL_CHECK_FALSE(!axes_type_ok, NNACL_REDUCE_AXES_TENSOR_ERROR);
  }
  return NNACL_OK;
}
195
/* Resolve the reduce axes from the optional second input tensor.
 * - One input only: num_axes_ is 0 (resolved later by CheckReduceParameters).
 * - Axes tensor with NULL data: reduce over every input dimension.
 * - int32 axes data: copied verbatim; int64 axes: narrowed element-wise. */
int ReduceCommonPrepare(ReduceStruct *reduce) {
  int ret = ReduceCheckInputsOutputs(reduce);
  if (ret != NNACL_OK) {
    return ret;
  }

  if (reduce->base_.in_size_ == ONE_TENSOR) {
    reduce->num_axes_ = 0;
    return NNACL_OK;
  }

  TensorC *axes_tensor = reduce->base_.in_[SECOND_INPUT];
  reduce->num_axes_ = GetElementNum(axes_tensor);
  // Bound check guards the memcpy/loop below against overrunning axes_.
  if (axes_tensor->data_ != NULL && (reduce->num_axes_ <= 0 || reduce->num_axes_ > MAX_SHAPE_SIZE)) {
    return NNACL_REDUCE_AXES_TENSOR_ERROR;
  }
  if (axes_tensor->data_ == NULL) {
    // No axes data provided: treat as "reduce all dimensions".
    reduce->num_axes_ = reduce->base_.in_[FIRST_INPUT]->shape_size_;
    for (int i = 0; i < reduce->num_axes_; i++) {
      reduce->axes_[i] = i;
    }
  } else {
    if (axes_tensor->data_type_ == kNumberTypeInt32 || axes_tensor->data_type_ == kNumberTypeInt) {
      NNACL_CHECK_FALSE(GetSize(axes_tensor) == 0, NNACL_REDUCE_AXES_TENSOR_ERROR);
      (void)memcpy(reduce->axes_, axes_tensor->data_, GetSize(axes_tensor));
    } else {
      // int64 axes are narrowed element-wise to int32.
      int64_t *axes_data = (int64_t *)axes_tensor->data_;
      // Fix: the loop index was size_t compared against the signed num_axes_
      // (sign-compare defect, inconsistent with the int loop above).
      for (int i = 0; i < reduce->num_axes_; i++) {
        reduce->axes_[i] = (int32_t)axes_data[i];
      }
    }
  }

  return NNACL_OK;
}
231
/* Validate and normalize the reduce axes against the input rank: wrap
 * negative axes, expand reduce_to_end_ into an explicit trailing-axis list,
 * and default empty axes to "reduce every dimension". */
int CheckReduceParameters(ReduceStruct *reduce) {
  int rank = reduce->base_.in_[FIRST_INPUT]->shape_size_;
  NNACL_CHECK_FALSE(reduce->num_axes_ > rank, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);

  for (int i = 0; i < reduce->num_axes_; i++) {
    int axis = reduce->axes_[i];
    NNACL_CHECK_FALSE(axis < -rank, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);
    NNACL_CHECK_FALSE(axis >= rank, NNACL_REDUCE_INPUT_SHAPE_SIZE_INVALID);
    // Wrap negative indices into [0, rank).
    reduce->axes_[i] = (axis < 0) ? (axis + rank) : axis;
  }

  if (((ReduceParameter *)reduce->base_.param_)->reduce_to_end_) {
    // Reduce every axis from axes_[0] through the last dimension.
    reduce->num_axes_ = rank - reduce->axes_[0];
    for (int i = 1; i < reduce->num_axes_; ++i) {
      reduce->axes_[i] = reduce->axes_[0] + i;
    }
  }

  if (reduce->num_axes_ == 0) {
    // Empty axes list means "reduce everything".
    for (int i = 0; i < rank; i++) {
      reduce->axes_[i] = i;
    }
    reduce->num_axes_ = rank;
  }
  return NNACL_OK;
}
261
/* For each reduce axis record: outer size (product of dims before the axis),
 * inner size (product of dims after), and the axis length itself. A reduced
 * axis collapses to 1 for the passes that follow it. */
void ReduceCalculateInnerOuterSize(ReduceStruct *reduce) {
  TensorC *input_tensor = reduce->base_.in_[FIRST_INPUT];
  int shape[MAX_SHAPE_SIZE];
  memcpy(shape, input_tensor->shape_, MAX_SHAPE_SIZE * sizeof(int));
  reduce->offset_size_ = 0;

  for (int i = 0; i < reduce->num_axes_; ++i) {
    int axis = reduce->axes_[i];
    int outer = 1;
    int inner = 1;
    for (int j = 0; j < axis; j++) {
      outer *= shape[j];
    }
    for (int k = axis + 1; k < (int)input_tensor->shape_size_; k++) {
      inner *= shape[k];
    }

    int slot = reduce->offset_size_++;
    reduce->outer_sizes_[slot] = outer;
    reduce->inner_sizes_[slot] = inner;
    reduce->axis_sizes_[slot] = shape[axis];
    shape[axis] = 1;
  }
}
287
/* Compute scratch-buffer element counts for all but the last reduce axis:
 * each intermediate pass needs room for the input with that axis collapsed.
 * The final pass writes straight into the output tensor. */
void ReduceCalculateTmpBufferSize(ReduceStruct *reduce) {
  reduce->data_buffers_size_ = 0;

  TensorC *input_tensor = reduce->base_.in_[FIRST_INPUT];
  int shape[MAX_SHAPE_SIZE];
  memcpy(shape, input_tensor->shape_, MAX_SHAPE_SIZE * sizeof(int));

  for (int i = 0; i < reduce->num_axes_ - 1; i++) {
    int axis = reduce->axes_[i];
    size_t buffer_size = 1;
    for (size_t j = 0; j < input_tensor->shape_size_; j++) {
      if ((int)j != axis) {
        buffer_size *= (size_t)shape[j];
      }
    }
    reduce->data_buffer_sizes_[reduce->data_buffers_size_++] = buffer_size;
    shape[axis] = 1;  // this axis is collapsed after its reduction pass
  }
}
307
/* The reduce degenerates to a plain copy when every reduced axis has length 1
 * and the mode performs no element-wise transform (unlike SumSquare/ASum/
 * All/L2, which change values even over unit axes). */
void ReduceDecideIfOnlyCopy(ReduceStruct *reduce) {
  ReduceModeC transform_modes[] = {Reduce_SumSquare, Reduce_ASum, Reduce_All, Reduce_L2};
  size_t mode_count = sizeof(transform_modes) / sizeof(transform_modes[0]);
  for (size_t i = 0; i < mode_count; i++) {
    if (((ReduceParameter *)reduce->base_.param_)->mode_ == transform_modes[i]) {
      reduce->only_copy_ = false;
      return;
    }
  }

  int *in_shape = reduce->base_.in_[FIRST_INPUT]->shape_;
  bool all_unit_axes = true;
  for (int i = 0; i < reduce->num_axes_; i++) {
    if (in_shape[reduce->axes_[i]] != 1) {
      all_unit_axes = false;
      break;
    }
  }
  reduce->only_copy_ = all_unit_axes;
}
329
ReducePrepare(struct KernelBase * self)330 int ReducePrepare(struct KernelBase *self) {
331 NNACL_CHECK_NULL_RETURN_ERR(self);
332 ReduceStruct *reduce = (ReduceStruct *)self;
333
334 NNACL_CHECK_FALSE(self->in_size_ < ONE_TENSOR, ONE_TENSOR);
335 NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, ONE_TENSOR);
336
337 int ret = ReduceCommonPrepare(reduce);
338 if (ret != NNACL_OK) {
339 return ret;
340 }
341
342 reduce->init_kernel_list_(self);
343 return NNACL_OK;
344 }
345
ReduceResize(struct KernelBase * self)346 int ReduceResize(struct KernelBase *self) {
347 NNACL_CHECK_NULL_RETURN_ERR(self);
348 ReduceStruct *reduce = (ReduceStruct *)self;
349
350 int ret = CheckReduceParameters(reduce);
351 if (ret != NNACL_OK) {
352 return ret;
353 }
354
355 ReduceDecideIfOnlyCopy(reduce);
356 ReduceCalculateTmpBufferSize(reduce);
357 ReduceCalculateInnerOuterSize(reduce);
358
359 if (reduce->num_axes_ == 1) {
360 self->thread_nr_ = self->UpdateThread(
361 TC_TYPE(PrimType_ReduceFusion, ((ReduceParameter *)reduce->base_.param_)->mode_),
362 reduce->inner_sizes_[Index0] * reduce->axis_sizes_[Index0],
363 reduce->inner_sizes_[Index0] * reduce->axis_sizes_[Index0], reduce->outer_sizes_[Index0], self->thread_nr_);
364 } else {
365 self->thread_nr_ = self->UpdateThread(TC_TYPE(PrimType_ReduceFusion, Reduce_Max + 1), 0, 0,
366 GetElementNum(self->out_[OUTPUT_INDEX]), self->thread_nr_);
367 }
368 return NNACL_OK;
369 }
370
ReduceCompute(struct KernelBase * self)371 int ReduceCompute(struct KernelBase *self) {
372 NNACL_CHECK_NULL_RETURN_ERR(self);
373 ReduceStruct *reduce = (ReduceStruct *)self;
374 NNACL_CHECK_FALSE(self->in_[FIRST_INPUT]->data_type_ != reduce->data_type_, NNACL_ERR);
375
376 if (reduce->only_copy_) {
377 return CopyReduceyInputToOutput(reduce);
378 }
379
380 int ret = MallocReduceTmpBuffer(reduce);
381 if (ret != NNACL_OK) {
382 FreeReduceTmpBuffer(reduce);
383 return ret;
384 }
385
386 reduce->src_data_ = self->in_[FIRST_INPUT]->data_;
387 reduce->handle_sum_square_(self);
388 for (int i = 0; i < reduce->num_axes_; i++) {
389 if (i != (reduce->num_axes_ - 1)) {
390 reduce->dst_data_ = reduce->data_buffers_[i];
391 } else {
392 reduce->dst_data_ = self->out_[FIRST_INPUT]->data_;
393 }
394 reduce->outer_size_ = reduce->outer_sizes_[i];
395 reduce->inner_size_ = reduce->inner_sizes_[i];
396 reduce->axis_size_ = reduce->axis_sizes_[i];
397 NNACL_CHECK_FALSE(reduce->axis_size_ == 0, NNACL_REDUCE_AXIS_SIZE_ERROR);
398
399 ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ReduceImpl, self, self->thread_nr_);
400 if (ret != NNACL_OK) {
401 FreeReduceTmpBuffer(reduce);
402 return ret;
403 }
404 reduce->src_data_ = reduce->dst_data_;
405 }
406
407 ReduceParameter *param = (ReduceParameter *)reduce->base_.param_;
408 if (param->reduce_to_end_ && fabsf(param->coeff) > 1e-5) {
409 ret = reduce->calculate_coeff_(self);
410 }
411
412 FreeReduceTmpBuffer(reduce);
413 return ret;
414 }
415
CreateReduce(OpParameter * param,int data_type)416 KernelBase *CreateReduce(OpParameter *param, int data_type) {
417 ReduceStruct *reduce = (ReduceStruct *)malloc(sizeof(ReduceStruct));
418 NNACL_MALLOC_CHECK_NULL_RETURN_NULL(reduce);
419 memset(reduce, 0, sizeof(ReduceStruct));
420 reduce->data_type_ = data_type;
421 reduce->base_.Release = DefaultRelease;
422 reduce->base_.Prepare = ReducePrepare;
423 reduce->base_.Resize = ReduceResize;
424 reduce->base_.Compute = ReduceCompute;
425 reduce->handle_sum_square_ = HandleReduceASumAndSumSquare;
426 reduce->calculate_coeff_ = CalculateReduceCoeffOutput;
427 reduce->init_kernel_list_ = InitialReduceKernelList;
428 reduce->call_uint_ = CallReduceUnit;
429 return (KernelBase *)reduce;
430 }
431
// Register the reduce kernel creator for every supported data type.
REG_KERNEL_CREATOR(PrimType_ReduceFusion, kNumberTypeBool, CreateReduce)
REG_KERNEL_CREATOR(PrimType_ReduceFusion, kNumberTypeInt32, CreateReduce)
REG_KERNEL_CREATOR(PrimType_ReduceFusion, kNumberTypeFloat32, CreateReduce)
435