/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/kernel/convolution_1x1.h"
#include "nnacl/fp32/pack_fp32.h"
#include "nnacl/base/conv1x1_base.h"
#include "nnacl/fp32/matmul_fp32.h"

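/* Parallel worker for the multi-thread-by-OC path: each task handles a
 * thread_stride_-wide slice of the output channels (the matmul col dimension),
 * multiplying the shared packed input by its slice of the packed weight. */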
int Conv1x1Run(void *cdata, int task_id, float l, float r) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)cdata;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  MatMulParameter *matmul = &conv_1x1->matmul_param_;

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(task_id, conv_1x1->thread_stride_, NNACL_ERR);
  int total_thread_stride_ = task_id * conv_1x1->thread_stride_;
  int res_stride = matmul->col_ - total_thread_stride_;
  int cur_oc = MSMIN(conv_1x1->thread_stride_, res_stride);
  if (cur_oc <= 0) {
    return NNACL_OK;
  }

  TensorC *out_tensor = conv_1x1->conv_.base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(out_tensor);
  float *bias = conv_1x1->conv_.bias_data_ == NULL
                  ? NULL
                  : (float *)conv_1x1->conv_.bias_data_ + conv_1x1->thread_stride_ * task_id;
  float *weight = (float *)conv_1x1->conv_.packed_weight_ + total_thread_stride_ * matmul->deep_;

  if (out_tensor->format_ == Format_NC4HW4) {
    MatMulOpt(conv_1x1->pack_input_, weight, conv_1x1->output_ptr_ + total_thread_stride_ * matmul->row_, bias,
              matmul->act_type_, matmul->deep_, matmul->row_, cur_oc, matmul->row_, OutType_NC4HW4);
  } else {
    MatMulOpt(conv_1x1->pack_input_, weight, conv_1x1->output_ptr_ + total_thread_stride_, bias, matmul->act_type_,
              matmul->deep_, matmul->row_, cur_oc, matmul->col_, OutType_Nhwc);
  }
  return NNACL_OK;
}

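/* Pack row-major activations into the column-tiled layout MatMulOpt expects.
 * The row-tile width matches the ISA-specific tile chosen in Prepare:
 * 6 with AVX, 4 with SSE, and 12 otherwise (e.g. ARM). */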
void Conv1x1PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {
#ifdef ENABLE_AVX
  RowMajor2Col6Major(src_ptr, dst_ptr, row, col);
#elif defined(ENABLE_SSE)
  RowMajor2Col4Major(src_ptr, dst_ptr, row, col);
#else
  RowMajor2Col12Major(src_ptr, dst_ptr, row, col);
#endif
}

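/* Parallel worker for the multi-thread-by-HW path: each task handles a
 * thread_stride_-wide slice of the spatial rows (the matmul row dimension).
 * The slice is processed row_tile_ rows at a time: pack the current rows into
 * this thread's private region of pack_input_, then run a full-width matmul. */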
int Conv1x1RunHw(void *cdata, int task_id, float l, float r) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)cdata;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  MatMulParameter *matmul = &conv_1x1->matmul_param_;
  TensorC *output_tensor = conv_1x1->conv_.base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(task_id, conv_1x1->thread_stride_, NNACL_ERR);
  int total_thread_stride_ = task_id * conv_1x1->thread_stride_;
  int res_stride = matmul->row_ - total_thread_stride_;
  int cur_hw_ = MSMIN(conv_1x1->thread_stride_, res_stride);
  if (cur_hw_ <= 0) {
    return NNACL_OK;
  }

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_thread_stride_, matmul->deep_, NNACL_ERR);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(task_id, conv_1x1->row_tile_, NNACL_ERR);
  int total_row_tile_ = task_id * conv_1x1->row_tile_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_row_tile_, matmul->deep_, NNACL_ERR);
  float *thread_input_ptr = conv_1x1->input_ptr_ + total_thread_stride_ * matmul->deep_;
  float *thread_pack_input = conv_1x1->pack_input_ + total_row_tile_ * matmul->deep_;
  float *thread_output_ptr = NULL;
  if (output_tensor->format_ != Format_NC4HW4) {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_thread_stride_, matmul->col_, NNACL_ERR);
    thread_output_ptr = conv_1x1->output_ptr_ + total_thread_stride_ * matmul->col_;
  } else {
    int col_min = MSMIN(matmul->col_, C4NUM);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_thread_stride_, col_min, NNACL_ERR);
    thread_output_ptr = conv_1x1->output_ptr_ + total_thread_stride_ * col_min;
  }
  float *cur_input = thread_input_ptr;
  float *cur_output = thread_output_ptr;
  float *bias = (float *)conv_1x1->conv_.bias_data_;
  for (int i = 0; i < cur_hw_; i += conv_1x1->row_tile_) {
    int cur_rows = (cur_hw_ - i >= conv_1x1->row_tile_) ? conv_1x1->row_tile_ : (cur_hw_ - i);
    Conv1x1PackMatmulInput(cur_input, thread_pack_input, cur_rows, matmul->deep_);
    if (output_tensor->format_ == Format_NC4HW4) {
      MatMulOpt(thread_pack_input, (float *)conv_1x1->conv_.packed_weight_, cur_output, bias, matmul->act_type_,
                matmul->deep_, cur_rows, matmul->col_, matmul->row_, OutType_NC4HW4);
      cur_output += conv_1x1->row_tile_ * MSMIN(matmul->col_, C4NUM);
    } else {
      MatMulOpt(thread_pack_input, (float *)conv_1x1->conv_.packed_weight_, cur_output, bias, matmul->act_type_,
                matmul->deep_, cur_rows, matmul->col_, matmul->col_, OutType_Nhwc);
      cur_output += conv_1x1->row_tile_ * matmul->col_;
    }
    cur_input += conv_1x1->row_tile_ * matmul->deep_;
  }

  return NNACL_OK;
}

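/* Repack the OC x IC filter into the column-tiled layout used by MatMulOpt:
 * 16-wide tiles with AVX, 4-wide on ARM32, and 8-wide otherwise. */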
void Conv1x1PackWeight(ConvolutionBaseStruct *conv) {
  TensorC *filter_tensor = conv->base_.in_[SECOND_INPUT];
  NNACL_CHECK_NULL_RETURN_VOID(filter_tensor);
  ConvComputeParam *compute = &conv->compute_;
  NNACL_CHECK_NULL_RETURN_VOID(compute);

  if (compute->in_c_ <= 0 || compute->out_c_ <= 0) {
    return;
  }

  void *origin_weight = conv->base_.train_session_ ? filter_tensor->data_ : conv->origin_weight_;
  NNACL_CHECK_NULL_RETURN_VOID(origin_weight);

#ifdef ENABLE_AVX
  RowMajor2Col16Major((float *)origin_weight, (float *)conv->packed_weight_, compute->out_c_, compute->in_c_);
#elif defined(ENABLE_ARM32)
  RowMajor2Col4Major((float *)origin_weight, (float *)conv->packed_weight_, compute->out_c_, compute->in_c_);
#else
  RowMajor2Col8Major((float *)origin_weight, (float *)conv->packed_weight_, compute->out_c_, compute->in_c_);
#endif
}

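/* Allocate the packed weight (and, when a bias tensor is present, a zeroed
 * bias buffer). Both are sized with out_c_ rounded up to col_tile_ so the
 * tiled matmul can write whole tiles without bounds checks. */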
int Conv1x1MallocWeightBiasData(ConvolutionBaseStruct *conv) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)conv;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);

  int size = conv->compute_.in_c_ * UP_ROUND(conv->compute_.out_c_, conv_1x1->col_tile_) * sizeof(float);
  if (!conv->base_.train_session_) {
    conv->packed_weight_ = ConvBaseGetConvPackWeightData(conv, size);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->packed_weight_);
  }

  if (conv->base_.in_size_ == THREE_TENSOR) {
    size = UP_ROUND(conv->compute_.out_c_, conv_1x1->col_tile_) * sizeof(float);
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, size);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
    memset(conv->bias_data_, 0, size);
  }
  return NNACL_OK;
}

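/* input_ptr_ is owned by this kernel only when pre_trans_input_ is set;
 * otherwise it aliases the caller's input tensor and must not be freed. */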
void Conv1x1FreeTmpBuffer(Convolution1x1Struct *conv_1x1) {
  if (conv_1x1->pre_trans_input_ && conv_1x1->input_ptr_ != NULL) {
    conv_1x1->conv_.base_.env_->Free(conv_1x1->conv_.base_.env_->allocator_, conv_1x1->input_ptr_);
    conv_1x1->input_ptr_ = NULL;
  }
}

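/* A 1x1 convolution is a plain GEMM: M (row_) = output_h * output_w,
 * N (col_) = output channels, K (deep_) = input channels. The aligned
 * sizes round M and N up to the ISA row/col tiles. */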
int InitConv1x1MatmulParam(Convolution1x1Struct *conv_1x1) {
  ConvParameter *conv_param = (ConvParameter *)conv_1x1->conv_.base_.param_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->output_h_, conv_param->output_w_, NNACL_ERR);
  conv_1x1->matmul_param_.row_ = conv_param->output_h_ * conv_param->output_w_;
  conv_1x1->matmul_param_.col_ = conv_param->output_channel_;
  conv_1x1->matmul_param_.deep_ = conv_param->input_channel_;
  conv_1x1->matmul_param_.row_align_ = UP_ROUND(conv_1x1->matmul_param_.row_, conv_1x1->row_tile_);
  conv_1x1->matmul_param_.col_align_ = UP_ROUND(conv_1x1->matmul_param_.col_, conv_1x1->col_tile_);
  conv_1x1->matmul_param_.act_type_ = conv_param->act_type_;
  return NNACL_OK;
}

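/* Pick the threading strategy: split the row (HW) dimension when it is both
 * large enough to feed every thread a full tile and larger than the column
 * (OC) dimension; otherwise split columns. When padding or stride differs
 * from the identity, also allocate a zero-filled buffer so Conv1x1InputPack
 * can gather the strided/padded input into a dense matrix before the GEMM. */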
int InitConv1x1Param(Convolution1x1Struct *conv_1x1) {
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->row_tile_, conv_1x1->conv_.base_.thread_nr_, NNACL_ERR);
  if ((conv_1x1->matmul_param_.row_ > (conv_1x1->row_tile_ * conv_1x1->conv_.base_.thread_nr_)) &&
      (conv_1x1->matmul_param_.row_ > conv_1x1->matmul_param_.col_)) {
    conv_1x1->multi_thread_by_hw_ = true;
    conv_1x1->conv_.base_.thread_nr_ =
      MSMIN(conv_1x1->conv_.base_.thread_nr_, UP_DIV(conv_1x1->matmul_param_.row_, conv_1x1->row_tile_));
    if (conv_1x1->conv_.base_.thread_nr_ <= 0) {
      return NNACL_ERR;
    }
    conv_1x1->thread_stride_ =
      UP_DIV(UP_DIV(conv_1x1->matmul_param_.row_, conv_1x1->row_tile_), conv_1x1->conv_.base_.thread_nr_) *
      conv_1x1->row_tile_;
  } else {
    conv_1x1->multi_thread_by_hw_ = false;
    conv_1x1->conv_.base_.thread_nr_ =
      MSMIN(conv_1x1->conv_.base_.thread_nr_, UP_DIV(conv_1x1->matmul_param_.col_, conv_1x1->col_tile_));
    if (conv_1x1->conv_.base_.thread_nr_ <= 0) {
      return NNACL_ERR;
    }
    conv_1x1->thread_stride_ =
      UP_DIV(UP_DIV(conv_1x1->matmul_param_.col_, conv_1x1->col_tile_), conv_1x1->conv_.base_.thread_nr_) *
      conv_1x1->col_tile_;
  }

  ConvParameter *conv_param = (ConvParameter *)conv_1x1->conv_.base_.param_;
  conv_1x1->pre_trans_input_ =
    (conv_param->pad_u_ != 0 || conv_param->pad_l_ != 0 || conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1);
  if (conv_1x1->pre_trans_input_) {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->matmul_param_.row_, conv_1x1->matmul_param_.deep_, NNACL_ERR);
    conv_1x1->input_ptr_ = (float *)(conv_1x1->conv_.base_.env_->Alloc(
      conv_1x1->conv_.base_.env_->allocator_,
      conv_1x1->matmul_param_.row_ * conv_1x1->matmul_param_.deep_ * sizeof(float)));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_1x1->input_ptr_);
    memset(conv_1x1->input_ptr_, 0, conv_1x1->matmul_param_.row_ * conv_1x1->matmul_param_.deep_ * sizeof(float));
  }

  return NNACL_OK;
}

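/* Re-derive the matmul shape and threading plan whenever tensor shapes
 * change; any previously pre-transformed input buffer is released first. */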
int Convolution1x1Resize(KernelBase *self) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);

  Conv1x1FreeTmpBuffer(conv_1x1);
  int error_code = ConvBasePrepare(&conv_1x1->conv_);
  if (error_code != NNACL_OK) {
    return error_code;
  }

  error_code = InitConv1x1MatmulParam(conv_1x1);
  if (error_code != NNACL_OK) {
    return error_code;
  }

  error_code = InitConv1x1Param(conv_1x1);
  if (error_code != NNACL_OK) {
    return error_code;
  }

  return NNACL_OK;
}

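/* Select the ISA-specific matmul tile shape (row_tile_ x col_tile_):
 * AVX 6x16, SSE 4x8, ARM32 12x4, default 12x8. Train sessions additionally
 * reserve work_size_ for the packed weight. */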
int Convolution1x1Prepare(KernelBase *self) {
  NNACL_CHECK_FALSE(self->in_size_ < TWO_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);

#ifdef ENABLE_AVX
  conv_1x1->row_tile_ = C6NUM;
  conv_1x1->col_tile_ = C16NUM;
#elif defined(ENABLE_SSE)
  conv_1x1->row_tile_ = C4NUM;
  conv_1x1->col_tile_ = C8NUM;
#elif defined(ENABLE_ARM32)
  conv_1x1->row_tile_ = C12NUM;
  conv_1x1->col_tile_ = C4NUM;
#else
  conv_1x1->row_tile_ = C12NUM;
  conv_1x1->col_tile_ = C8NUM;
#endif

  if (self->train_session_) {
    int output_tile_size = UP_ROUND(conv_1x1->conv_.compute_.out_c_, conv_1x1->col_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->conv_.compute_.in_c_, output_tile_size, NNACL_ERR);
    size_t size = conv_1x1->conv_.compute_.in_c_ * output_tile_size * sizeof(float);
    conv_1x1->conv_.base_.work_size_ = size;
  }

  int error_code = ConvBaseInitConvWeightBias(&conv_1x1->conv_);
  if (error_code != NNACL_OK) {
    return error_code;
  }
  return NNACL_OK;
}

int Convolution1x1Release(KernelBase *self) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  Conv1x1FreeTmpBuffer(conv_1x1);
  ConvBaseRelease(&conv_1x1->conv_);
  return NNACL_OK;
}

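/* Main entry: allocate the packing buffer (one private region per thread in
 * HW mode, the full aligned matrix otherwise), run ConvBaseRepackWeight, then
 * for each batch optionally gather the strided/padded input and launch the
 * parallel matmul workers. */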
int Convolution1x1Compute(KernelBase *self) {
  Convolution1x1Struct *conv_1x1 = (Convolution1x1Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_1x1);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  TensorC *input_tensor = self->in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(input_tensor);
  TensorC *output_tensor = self->out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);

  float *src_in = (float *)input_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(src_in);
  float *src_out = (float *)output_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(src_out);

  int pack_input_size = 0;
  if (conv_1x1->multi_thread_by_hw_) {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->conv_.base_.thread_nr_, conv_1x1->row_tile_, NNACL_ERR);
    int total_row_tile_ = conv_1x1->conv_.base_.thread_nr_ * conv_1x1->row_tile_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(total_row_tile_, conv_1x1->matmul_param_.deep_, NNACL_ERR);
    pack_input_size = total_row_tile_ * conv_1x1->matmul_param_.deep_;
  } else {
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->matmul_param_.row_align_, conv_1x1->matmul_param_.deep_, NNACL_ERR);
    pack_input_size = conv_1x1->matmul_param_.row_align_ * conv_1x1->matmul_param_.deep_;
  }
  conv_1x1->pack_input_ =
    (float *)conv_1x1->conv_.base_.env_->Alloc(conv_1x1->conv_.base_.env_->allocator_, pack_input_size * sizeof(float));
  NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_1x1->pack_input_);

  int ret = ConvBaseRepackWeight(&conv_1x1->conv_);
  if (ret != NNACL_OK) {
    return ret;
  }

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_1x1->matmul_param_.row_, conv_1x1->matmul_param_.col_, NNACL_ERR);
  int matmul_size = conv_1x1->matmul_param_.row_ * conv_1x1->matmul_param_.col_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->input_batch_ - 1, matmul_size, NNACL_ERR);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->input_h_, conv_param->input_w_, NNACL_ERR);
  int conv_input_hw = conv_param->input_h_ * conv_param->input_w_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_input_hw, conv_param->input_channel_, NNACL_ERR);
  int conv_input_bhw = conv_input_hw * conv_param->input_channel_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_param->input_batch_ - 1, conv_input_bhw, NNACL_ERR);
  for (int batch_index = 0; batch_index < conv_param->input_batch_; batch_index++) {
    conv_1x1->output_ptr_ = src_out + batch_index * matmul_size;
    float *tmp_in = src_in + batch_index * conv_input_bhw;
    if (conv_1x1->pre_trans_input_) {
      Conv1x1InputPack(tmp_in, conv_1x1->input_ptr_, conv_param, sizeof(float));
    } else {
      conv_1x1->input_ptr_ = tmp_in;
    }
    if (conv_1x1->multi_thread_by_hw_) {
      ret = self->env_->ParallelLaunch(self->env_->thread_pool_, Conv1x1RunHw, self, self->thread_nr_);
    } else {
      Conv1x1PackMatmulInput(conv_1x1->input_ptr_, conv_1x1->pack_input_, conv_1x1->matmul_param_.row_,
                             conv_1x1->matmul_param_.deep_);
      ret = self->env_->ParallelLaunch(self->env_->thread_pool_, Conv1x1Run, self, self->thread_nr_);
    }
    if (ret != NNACL_OK) {
      break;
    }
  }

  if (conv_1x1->pack_input_ != NULL) {
    self->env_->Free(self->env_->allocator_, conv_1x1->pack_input_);
    conv_1x1->pack_input_ = NULL;
  }
  return ret;
}

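/* Factory: allocate the kernel struct and wire the convolution-base hooks
 * (weight/bias allocation, weight packing) and the KernelBase vtable. */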
ConvolutionBaseStruct *CreateConvolution1x1(ConvParameter *conv_param) {
  Convolution1x1Struct *conv1x1 = (Convolution1x1Struct *)malloc(sizeof(Convolution1x1Struct));
  NNACL_MALLOC_CHECK_NULL_RETURN_NULL(conv1x1);
  memset(conv1x1, 0, sizeof(Convolution1x1Struct));

  conv1x1->conv_.is_sharing_pack_ = false;
  conv1x1->conv_.malloc_weight_bias_ = Conv1x1MallocWeightBiasData;
  conv1x1->conv_.pack_weight_ = Conv1x1PackWeight;

  conv1x1->conv_.base_.Resize = Convolution1x1Resize;
  conv1x1->conv_.base_.Prepare = Convolution1x1Prepare;
  conv1x1->conv_.base_.Release = Convolution1x1Release;
  conv1x1->conv_.base_.Compute = Convolution1x1Compute;

  return (ConvolutionBaseStruct *)conv1x1;
}

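/* A minimal usage sketch, assuming the caller (normally the surrounding
 * framework) has already bound tensors, environment, and parameters; the
 * setup shown here is illustrative, not part of this file's API:
 *
 *   ConvParameter *param = ...;  // 1x1 kernel, strides/pads filled in
 *   ConvolutionBaseStruct *conv = CreateConvolution1x1(param);
 *   KernelBase *base = &conv->base_;
 *   // base->in_ / out_ / env_ / param_ / thread_nr_ are assumed set here.
 *   base->Prepare(base);   // pick tiles, init weight/bias
 *   base->Resize(base);    // derive matmul shape and threading plan
 *   base->Compute(base);   // run the 1x1 convolution
 *   base->Release(base);   // free packed buffers
 *   free(conv);
 */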