/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/kernel/convolution_1x1.h"
#include "nnacl/fp32/pack_fp32.h"
#include "nnacl/base/conv1x1_base.h"
#include "nnacl/fp32/matmul_fp32.h"

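/* Per-thread worker used when the matmul is split along the output-channel
 * (col) dimension: task_id selects a band of thread_stride_ output channels
 * that is multiplied against the already-packed input. The unused l/r floats
 * are presumably required by the ParallelLaunch callback signature. */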
int Conv1x1Run(void *cdata, int task_id, float l, float r) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)cdata;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  MatMulParameter *matmul = &conv_1x1->matmul_param_;

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(task_id, conv_1x1->thread_stride_, NNACL_ERR);
  int total_thread_stride = task_id * conv_1x1->thread_stride_;
  int res_stride = matmul->col_ - total_thread_stride;
  int cur_oc = MSMIN(conv_1x1->thread_stride_, res_stride);
  if (cur_oc <= 0) {
    return NNACL_OK;
  }

  TensorC *out_tensor = conv_1x1->conv_.base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(out_tensor);
  float *bias = conv_1x1->conv_.bias_data_ == NULL
                  ? NULL
                  : (float *)conv_1x1->conv_.bias_data_ + conv_1x1->thread_stride_ * task_id;
  float *weight = (float *)conv_1x1->conv_.packed_weight_ + total_thread_stride * matmul->deep_;

  if (out_tensor->format_ == Format_NC4HW4) {
    MatMulOpt(conv_1x1->pack_input_, weight, conv_1x1->output_ptr_ + total_thread_stride * matmul->row_, bias,
              matmul->act_type_, matmul->deep_, matmul->row_, cur_oc, matmul->row_, OutType_NC4HW4);
  } else {
    MatMulOpt(conv_1x1->pack_input_, weight, conv_1x1->output_ptr_ + total_thread_stride, bias, matmul->act_type_,
              matmul->deep_, matmul->row_, cur_oc, matmul->col_, OutType_Nhwc);
  }
  return NNACL_OK;
}

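/* Packs a row-major input block into the column-tiled layout expected by
 * MatMulOpt. The tile width (6/4/12) mirrors the row_tile_ chosen for the
 * same SIMD backend in Convolution1x1Prepare. */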
void Conv1x1PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
#ifdef ENABLE_AVX
  RowMajor2Col6Major(src_ptr, dst_ptr, row, col);
#elif defined(ENABLE_SSE)
  RowMajor2Col4Major(src_ptr, dst_ptr, row, col);
#else
  RowMajor2Col12Major(src_ptr, dst_ptr, row, col);
#endif
}

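/* Per-thread worker used when the matmul is split along the spatial (H*W,
 * i.e. row) dimension: each task packs its own input slice on the fly,
 * row_tile_ rows at a time, and multiplies it against the full packed
 * weight. */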
int Conv1x1RunHw(void *cdata, int task_id, float l, float r) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)cdata;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  MatMulParameter *matmul = &conv_1x1->matmul_param_;
  TensorC *output_tensor = conv_1x1->conv_.base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(task_id, conv_1x1->thread_stride_, NNACL_ERR);
  int total_thread_stride = task_id * conv_1x1->thread_stride_;
  int res_stride = matmul->row_ - total_thread_stride;
  int cur_hw_ = MSMIN(conv_1x1->thread_stride_, res_stride);
  if (cur_hw_ <= 0) {
    return NNACL_OK;
  }

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_thread_stride, matmul->deep_, NNACL_ERR);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(task_id, conv_1x1->row_tile_, NNACL_ERR);
  int total_row_tile_ = task_id * conv_1x1->row_tile_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_row_tile_, matmul->deep_, NNACL_ERR);
  float *thread_input_ptr = conv_1x1->input_ptr_ + total_thread_stride * matmul->deep_;
  float *thread_pack_input = conv_1x1->pack_input_ + total_row_tile_ * matmul->deep_;
  float *thread_output_ptr = NULL;
  if (output_tensor->format_ != Format_NC4HW4) {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_thread_stride, matmul->col_, NNACL_ERR);
    thread_output_ptr = conv_1x1->output_ptr_ + total_thread_stride * matmul->col_;
  } else {
    int col_min = MSMIN(matmul->col_, C4NUM);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_thread_stride, col_min, NNACL_ERR);
    thread_output_ptr = conv_1x1->output_ptr_ + total_thread_stride * col_min;
  }
  float *cur_input = thread_input_ptr;
  float *cur_output = thread_output_ptr;
  float *bias = (float *)conv_1x1->conv_.bias_data_;
  for (int i = 0; i < cur_hw_; i += conv_1x1->row_tile_) {
    int cur_rows = (cur_hw_ - i >= conv_1x1->row_tile_) ? conv_1x1->row_tile_ : (cur_hw_ - i);
    Conv1x1PackMatmulInput(cur_input, thread_pack_input, cur_rows, matmul->deep_);
    if (output_tensor->format_ == Format_NC4HW4) {
      MatMulOpt(thread_pack_input, (float *)conv_1x1->conv_.packed_weight_, cur_output, bias, matmul->act_type_,
                matmul->deep_, cur_rows, matmul->col_, matmul->row_, OutType_NC4HW4);
      cur_output += conv_1x1->row_tile_ * MSMIN(matmul->col_, C4NUM);
    } else {
      MatMulOpt(thread_pack_input, (float *)conv_1x1->conv_.packed_weight_, cur_output, bias, matmul->act_type_,
                matmul->deep_, cur_rows, matmul->col_, matmul->col_, OutType_Nhwc);
      cur_output += conv_1x1->row_tile_ * matmul->col_;
    }
    cur_input += conv_1x1->row_tile_ * matmul->deep_;
  }

  return NNACL_OK;
}

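/* Repacks the 1x1 filter, which is effectively a plain out_c x in_c matrix,
 * into the column-tiled layout the matmul kernels consume; the tile width
 * (16/4/8) matches the col_tile_ selected in Convolution1x1Prepare. */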
void Conv1x1PackWeight(ConvolutionBaseStruct *conv) {
  TensorC *filter_tensor = conv->base_.in_[SECOND_INPUT];
  NNACL_CHECK_NULL_RETURN_VOID(filter_tensor);
  ConvComputeParam *compute = &conv->compute_;
  NNACL_CHECK_NULL_RETURN_VOID(compute);

  if (compute->in_c_ <= 0 || compute->out_c_ <= 0) {
    return;
  }

  void *origin_weight = conv->base_.train_session_ ? filter_tensor->data_ : conv->origin_weight_;
  NNACL_CHECK_NULL_RETURN_VOID(origin_weight);

#ifdef ENABLE_AVX
  RowMajor2Col16Major((float *)origin_weight, (float *)conv->packed_weight_, compute->out_c_, compute->in_c_);
#elif defined(ENABLE_ARM32)
  RowMajor2Col4Major((float *)origin_weight, (float *)conv->packed_weight_, compute->out_c_, compute->in_c_);
#else
  RowMajor2Col8Major((float *)origin_weight, (float *)conv->packed_weight_, compute->out_c_, compute->in_c_);
#endif
}

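/* Allocates the packed weight buffer, with the output-channel dimension
 * rounded up to col_tile_, and a zero-filled bias buffer of the same padded
 * width when a bias tensor is supplied. */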
int Conv1x1MallocWeightBiasData(ConvolutionBaseStruct *conv) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)conv;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);

  int size = conv->compute_.in_c_ * UP_ROUND(conv->compute_.out_c_, conv_1x1->col_tile_) * sizeof(float);
  if (!conv->base_.train_session_) {
    conv->packed_weight_ = ConvBaseGetConvPackWeightData(conv, size);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->packed_weight_);
  }

  if (conv->base_.in_size_ == THREE_TENSOR) {
    size = UP_ROUND(conv->compute_.out_c_, conv_1x1->col_tile_) * sizeof(float);
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, size);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
    memset(conv->bias_data_, 0, size);
  }
  return NNACL_OK;
}

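/* Frees the padded input copy, which exists only when pre_trans_input_ is
 * set (i.e. the layer has padding or a non-unit stride). */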
void Conv1x1FreeTmpBuffer(Convolution1x1Struct *conv_1x1) {
  if (conv_1x1->pre_trans_input_ && conv_1x1->input_ptr_ != NULL) {
    conv_1x1->conv_.base_.env_->Free(conv_1x1->conv_.base_.env_->allocator_, conv_1x1->input_ptr_);
    conv_1x1->input_ptr_ = NULL;
  }
}

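/* Maps the 1x1 convolution onto a matmul: row = output H*W, col = output
 * channels, deep = input channels, with alignments rounded up to the SIMD
 * tile sizes. */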
int InitConv1x1MatmulParam(Convolution1x1Struct *conv_1x1) {
  ConvParameter *conv_param = (ConvParameter *)conv_1x1->conv_.base_.param_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->output_h_, conv_param->output_w_, NNACL_ERR);
  conv_1x1->matmul_param_.row_ = conv_param->output_h_ * conv_param->output_w_;
  conv_1x1->matmul_param_.col_ = conv_param->output_channel_;
  conv_1x1->matmul_param_.deep_ = conv_param->input_channel_;
  conv_1x1->matmul_param_.row_align_ = UP_ROUND(conv_1x1->matmul_param_.row_, conv_1x1->row_tile_);
  conv_1x1->matmul_param_.col_align_ = UP_ROUND(conv_1x1->matmul_param_.col_, conv_1x1->col_tile_);
  conv_1x1->matmul_param_.act_type_ = conv_param->act_type_;
  return NNACL_OK;
}

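/* Decides the parallelization axis: if there are more output rows (H*W)
 * than one row tile per thread and rows outnumber columns, threads split
 * the spatial dimension (Conv1x1RunHw); otherwise they split the output
 * channels (Conv1x1Run). Also allocates the zeroed input copy used when
 * padding or stride forces a repack of the input. */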
int InitConv1x1Param(Convolution1x1Struct *conv_1x1) {
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->row_tile_, conv_1x1->conv_.base_.thread_nr_, NNACL_ERR);
  if ((conv_1x1->matmul_param_.row_ > (conv_1x1->row_tile_ * conv_1x1->conv_.base_.thread_nr_)) &&
      (conv_1x1->matmul_param_.row_ > conv_1x1->matmul_param_.col_)) {
    conv_1x1->multi_thread_by_hw_ = true;
    conv_1x1->conv_.base_.thread_nr_ =
      MSMIN(conv_1x1->conv_.base_.thread_nr_, UP_DIV(conv_1x1->matmul_param_.row_, conv_1x1->row_tile_));
    if (conv_1x1->conv_.base_.thread_nr_ <= 0) {
      return NNACL_ERR;
    }
    conv_1x1->thread_stride_ =
      UP_DIV(UP_DIV(conv_1x1->matmul_param_.row_, conv_1x1->row_tile_), conv_1x1->conv_.base_.thread_nr_) *
      conv_1x1->row_tile_;
  } else {
    conv_1x1->multi_thread_by_hw_ = false;
    conv_1x1->conv_.base_.thread_nr_ =
      MSMIN(conv_1x1->conv_.base_.thread_nr_, UP_DIV(conv_1x1->matmul_param_.col_, conv_1x1->col_tile_));
    if (conv_1x1->conv_.base_.thread_nr_ <= 0) {
      return NNACL_ERR;
    }
    conv_1x1->thread_stride_ =
      UP_DIV(UP_DIV(conv_1x1->matmul_param_.col_, conv_1x1->col_tile_), conv_1x1->conv_.base_.thread_nr_) *
      conv_1x1->col_tile_;
  }

  ConvParameter *conv_param = (ConvParameter *)conv_1x1->conv_.base_.param_;
  conv_1x1->pre_trans_input_ =
    (conv_param->pad_u_ != 0 || conv_param->pad_l_ != 0 || conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1);
  if (conv_1x1->pre_trans_input_) {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->matmul_param_.row_, conv_1x1->matmul_param_.deep_, NNACL_ERR);
    conv_1x1->input_ptr_ = (float *)(conv_1x1->conv_.base_.env_->Alloc(
      conv_1x1->conv_.base_.env_->allocator_,
      conv_1x1->matmul_param_.row_ * conv_1x1->matmul_param_.deep_ * sizeof(float)));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_1x1->input_ptr_);
    memset(conv_1x1->input_ptr_, 0, conv_1x1->matmul_param_.row_ * conv_1x1->matmul_param_.deep_ * sizeof(float));
  }

  return NNACL_OK;
}

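/* Re-derives the matmul shape and threading parameters after a shape
 * change, dropping any previously allocated padded input buffer first. */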
int Convolution1x1Resize(KernelBase *self) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);

  Conv1x1FreeTmpBuffer(conv_1x1);
  int error_code = ConvBasePrepare(&conv_1x1->conv_);
  if (error_code != NNACL_OK) {
    return error_code;
  }

  error_code = InitConv1x1MatmulParam(conv_1x1);
  if (error_code != NNACL_OK) {
    return error_code;
  }

  error_code = InitConv1x1Param(conv_1x1);
  if (error_code != NNACL_OK) {
    return error_code;
  }

  return NNACL_OK;
}

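/* One-time setup: picks the SIMD-dependent row/col tile sizes, reserves
 * the packed-weight workspace for training sessions, and packs the weight
 * and bias data. */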
int Convolution1x1Prepare(KernelBase *self) {
  NNACL_CHECK_FALSE(self->in_size_ < TWO_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);

#ifdef ENABLE_AVX
  conv_1x1->row_tile_ = C6NUM;
  conv_1x1->col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
  conv_1x1->row_tile_ = C4NUM;
  conv_1x1->col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
  conv_1x1->row_tile_ = C12NUM;
  conv_1x1->col_tile_ = C4NUM;
#else
  conv_1x1->row_tile_ = C12NUM;
  conv_1x1->col_tile_ = C8NUM;
#endif

  if (self->train_session_) {
    int output_tile_size = UP_ROUND(conv_1x1->conv_.compute_.out_c_, conv_1x1->col_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->conv_.compute_.in_c_, output_tile_size, NNACL_ERR);
    size_t size = conv_1x1->conv_.compute_.in_c_ * output_tile_size * sizeof(float);
    conv_1x1->conv_.base_.work_size_ = size;
  }

  int error_code = ConvBaseInitConvWeightBias(&conv_1x1->conv_);
  if (error_code != NNACL_OK) {
    return error_code;
  }
  return NNACL_OK;
}

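/* Releases the padded input copy and the shared convolution resources. */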
int Convolution1x1Release(KernelBase *self) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  Conv1x1FreeTmpBuffer(conv_1x1);
  ConvBaseRelease(&conv_1x1->conv_);
  return NNACL_OK;
}

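/* Forward pass: allocates the packed-input scratch buffer (sized per
 * threading mode), repacks the weight if needed, then runs the matmul batch
 * by batch. When threading by output channel the whole input is packed up
 * front; when threading by spatial rows each task packs its own slice in
 * Conv1x1RunHw. */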
int Convolution1x1Compute(KernelBase *self) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  TensorC *input_tensor = self->in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(input_tensor);
  TensorC *output_tensor = self->out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);

  float *src_in = (float *)input_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(src_in);
  float *src_out = (float *)output_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(src_out);

  int pack_input_size = 0;
  if (conv_1x1->multi_thread_by_hw_) {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->conv_.base_.thread_nr_, conv_1x1->row_tile_, NNACL_ERR);
    int total_row_tile_ = conv_1x1->conv_.base_.thread_nr_ * conv_1x1->row_tile_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_row_tile_, conv_1x1->matmul_param_.deep_, NNACL_ERR);
    pack_input_size = total_row_tile_ * conv_1x1->matmul_param_.deep_;
  } else {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->matmul_param_.row_align_, conv_1x1->matmul_param_.deep_, NNACL_ERR);
    pack_input_size = conv_1x1->matmul_param_.row_align_ * conv_1x1->matmul_param_.deep_;
  }
  conv_1x1->pack_input_ =
    (float *)conv_1x1->conv_.base_.env_->Alloc(conv_1x1->conv_.base_.env_->allocator_, pack_input_size * sizeof(float));
  NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_1x1->pack_input_);

  int ret = ConvBaseRepackWeight(&conv_1x1->conv_);
  if (ret != NNACL_OK) {
    /* Free the scratch buffer on the early-return path as well to avoid leaking it. */
    self->env_->Free(self->env_->allocator_, conv_1x1->pack_input_);
    conv_1x1->pack_input_ = NULL;
    return ret;
  }

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->matmul_param_.row_, conv_1x1->matmul_param_.col_, NNACL_ERR);
  int matmul_size = conv_1x1->matmul_param_.row_ * conv_1x1->matmul_param_.col_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->input_batch_ - 1, matmul_size, NNACL_ERR);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->input_h_, conv_param->input_w_, NNACL_ERR);
  int conv_input_hw = conv_param->input_h_ * conv_param->input_w_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_input_hw, conv_param->input_channel_, NNACL_ERR);
  int conv_input_bhw = conv_input_hw * conv_param->input_channel_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->input_batch_ - 1, conv_input_bhw, NNACL_ERR);
  for (int batch_index = 0; batch_index < conv_param->input_batch_; batch_index++) {
    conv_1x1->output_ptr_ = src_out + batch_index * matmul_size;
    float *tmp_in = src_in + batch_index * conv_input_bhw;
    if (conv_1x1->pre_trans_input_) {
      Conv1x1InputPack(tmp_in, conv_1x1->input_ptr_, conv_param, sizeof(float));
    } else {
      conv_1x1->input_ptr_ = tmp_in;
    }
    if (conv_1x1->multi_thread_by_hw_) {
      ret = self->env_->ParallelLaunch(self->env_->thread_pool_, Conv1x1RunHw, self, self->thread_nr_);
    } else {
      Conv1x1PackMatmulInput(conv_1x1->input_ptr_, conv_1x1->pack_input_, conv_1x1->matmul_param_.row_,
                             conv_1x1->matmul_param_.deep_);
      ret = self->env_->ParallelLaunch(self->env_->thread_pool_, Conv1x1Run, self, self->thread_nr_);
    }
    if (ret != NNACL_OK) {
      break;
    }
  }

  if (conv_1x1->pack_input_ != NULL) {
    self->env_->Free(self->env_->allocator_, conv_1x1->pack_input_);
    conv_1x1->pack_input_ = NULL;
  }
  return ret;
}

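/* Factory: zero-initializes a Convolution1x1Struct and wires up the kernel
 * and convolution-base callbacks. conv_param is currently unused here;
 * shape-dependent state is filled in later by Prepare/Resize. */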
ConvolutionBaseStruct *CreateConvolution1x1(ConvParameter *conv_param) {
  Convolution1x1Struct *conv1x1 = (Convolution1x1Struct *)malloc(sizeof(Convolution1x1Struct));
  NNACL_MALLOC_CHECK_NULL_RETURN_NULL(conv1x1);
  memset(conv1x1, 0, sizeof(Convolution1x1Struct));

  conv1x1->conv_.is_sharing_pack_ = false;
  conv1x1->conv_.malloc_weight_bias_ = Conv1x1MallocWeightBiasData;
  conv1x1->conv_.pack_weight_ = Conv1x1PackWeight;

  conv1x1->conv_.base_.Resize = Convolution1x1Resize;
  conv1x1->conv_.base_.Prepare = Convolution1x1Prepare;
  conv1x1->conv_.base_.Release = Convolution1x1Release;
  conv1x1->conv_.base_.Compute = Convolution1x1Compute;

  return (ConvolutionBaseStruct *)conv1x1;
}