• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
18 #include "nnacl/base/conv1x1_base.h"
19 #include "nnacl/fp16/conv_fp16.h"
20 #include "nnacl/fp16/cast_fp16.h"
21 #include "nnacl/fp16/pack_fp16.h"
22 #include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
23 #include "include/errorcode.h"
24 
25 using mindspore::lite::RET_ERROR;
26 using mindspore::lite::RET_MEMORY_FAILED;
27 using mindspore::lite::RET_OK;
28 
29 namespace mindspore::kernel {
InitMatmulParam()30 int Convolution1x1FP16CPUKernel::InitMatmulParam() {
31   matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
32   matmul_param_->col_ = conv_param_->output_channel_;
33   matmul_param_->deep_ = conv_param_->input_channel_;
34   matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
35   matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_);
36   matmul_param_->act_type_ = conv_param_->act_type_;
37   return RET_OK;
38 }
39 
~Convolution1x1FP16CPUKernel()40 Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() {
41   FreeTmpBuffer();
42   if (matmul_param_ != nullptr) {
43     delete matmul_param_;
44     matmul_param_ = nullptr;
45   }
46   return;
47 }
48 
InitConv1x1Param()49 int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
50   pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
51                       conv_param_->stride_w_ != 1);
52 
53   if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
54     multi_thread_by_hw_ = true;
55     thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
56     if (thread_count_ <= 0) {
57       MS_LOG(ERROR) << "thread_count_ must be greater than 0!";
58       return RET_ERROR;
59     }
60     thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
61   } else {
62     multi_thread_by_hw_ = false;
63     thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
64     if (thread_count_ <= 0) {
65       MS_LOG(ERROR) << "thread_count_ must be greater than 0!";
66       return RET_ERROR;
67     }
68     thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
69   }
70   matmul_param_->op_parameter_.thread_num_ = thread_count_;
71 
72   if (pre_trans_input_) {
73     input_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)));
74     if (input_ptr_ == nullptr) {
75       MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!";
76       return RET_MEMORY_FAILED;
77     }
78     memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t));
79   }
80   return RET_OK;
81 }
82 
MallocWeightBiasData()83 int Convolution1x1FP16CPUKernel::MallocWeightBiasData() {
84   auto weight_tensor = in_tensors_.at(kWeightIndex);
85   auto input_channel = weight_tensor->Channel();
86   auto output_channel = weight_tensor->Batch();
87 
88   size_t size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
89   if (!op_parameter_->is_train_session_) {
90     if (packed_weight_ == nullptr) {
91       packed_weight_ = malloc(size);
92       if (packed_weight_ == nullptr) {
93         MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
94         return RET_ERROR;
95       }
96     }
97     memset(packed_weight_, 0, size);
98   }
99 
100   if (in_tensors_.size() == kInputSize2) {
101     size = UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
102     if (bias_data_ == nullptr) {
103       bias_data_ = malloc(size);
104       if (bias_data_ == nullptr) {
105         MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
106         return RET_ERROR;
107       }
108     }
109     memset(bias_data_, 0, size);
110   }
111   return RET_OK;
112 }
113 
PackWeight()114 void Convolution1x1FP16CPUKernel::PackWeight() {
115   auto weight_tensor = in_tensors_.at(kWeightIndex);
116   auto input_channel = weight_tensor->Channel();
117   auto output_channel = weight_tensor->Batch();
118   void *weight_origin = (op_parameter_->is_train_session_) ? weight_tensor->data() : origin_weight_;
119   MS_ASSERT(weight_origin != nullptr);
120 #ifdef ENABLE_ARM64
121   if (out_tensors_.front()->format() == NC4HW4) {
122     ColMajor2Row8MajorFp16(weight_origin, reinterpret_cast<float16_t *>(packed_weight_), input_channel, output_channel,
123                            true);
124   } else {
125     RowMajor2Col16MajorFp16Opt(static_cast<const float16_t *>(weight_origin),
126                                reinterpret_cast<float16_t *>(packed_weight_), output_channel, input_channel);
127   }
128 #else
129   ColMajor2Row8MajorFp16(weight_origin, reinterpret_cast<float16_t *>(packed_weight_), input_channel, output_channel,
130                          true);
131 #endif
132 }
133 
Init()134 int Convolution1x1FP16CPUKernel::Init() {
135   CHECK_LESS_RETURN(in_tensors_.size(), 2);
136   CHECK_LESS_RETURN(out_tensors_.size(), 1);
137   UpdateOriginWeightAndBias();
138 #ifdef ENABLE_ARM64
139   if (out_tensors_.front()->format() == NC4HW4) {
140     row_tile_ = C16NUM;
141     col_tile_ = C8NUM;
142   } else {
143     row_tile_ = C12NUM;
144     col_tile_ = C16NUM;
145   }
146 #else
147   row_tile_ = C12NUM;
148   col_tile_ = C8NUM;
149 #endif
150   if (op_parameter_->is_train_session_) {
151     auto weight_tensor = in_tensors_.at(kWeightIndex);
152     CHECK_NULL_RETURN(weight_tensor);
153     auto input_channel = weight_tensor->Channel();
154     auto output_channel = weight_tensor->Batch();
155     size_t size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
156     set_workspace_size(size);
157   }
158   matmul_param_ = new (std::nothrow) MatMulParameter();
159   if (matmul_param_ == nullptr) {
160     MS_LOG(ERROR) << "Init matmul_param_ failed.";
161     return RET_ERROR;
162   }
163   int ret = InitConvWeightBias();
164   if (ret != RET_OK) {
165     MS_LOG(ERROR) << "Init weight bias failed.";
166     return ret;
167   }
168   return RET_OK;
169 }
170 
FreeTmpBuffer()171 void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
172   if (pre_trans_input_ && input_ptr_ != nullptr) {
173     free(input_ptr_);
174     input_ptr_ = nullptr;
175   }
176   return;
177 }
178 
ReSize()179 int Convolution1x1FP16CPUKernel::ReSize() {
180   FreeTmpBuffer();
181   auto ret = ConvolutionBaseCPUKernel::Init();
182   if (ret != RET_OK) {
183     MS_LOG(ERROR) << "ConvolutionBase init failed.";
184     return ret;
185   }
186   ret = InitMatmulParam();
187   if (ret != RET_OK) {
188     MS_LOG(ERROR) << "Init matmul param failed.";
189     return ret;
190   }
191   ret = InitConv1x1Param();
192   if (ret != RET_OK) {
193     MS_LOG(ERROR) << "Init conv1x1 param failed.";
194     return ret;
195   }
196   return RET_OK;
197 }
198 
RunOc(int task_id)199 int Convolution1x1FP16CPUKernel::RunOc(int task_id) {
200   int cur_stride = matmul_param_->col_ - task_id * thread_stride_;
201   int cur_oc = MSMIN(thread_stride_, cur_stride);
202   if (cur_oc <= 0) {
203     return RET_OK;
204   }
205 
206   auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float16_t *>(bias_data_) + thread_stride_ * task_id;
207 
208   if (out_tensors_.front()->format() == NC4HW4) {
209     Conv1x1OutNc8hw8MultiThreadByWeightFp16(input_ptr_, pack_input_, reinterpret_cast<float16_t *>(packed_weight_),
210                                             reinterpret_cast<float16_t *>(bias_data_), output_ptr_, task_id,
211                                             matmul_param_);
212   } else {
213 #ifdef ENABLE_ARM64
214     MatMul12x16Fp16Opt(pack_input_,
215                        reinterpret_cast<float16_t *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
216                        output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
217                        matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
218 #else
219     MatMul12x8A32Fp16(pack_input_,
220                       reinterpret_cast<float16_t *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
221                       output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
222                       matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
223 #endif
224   }
225   return RET_OK;
226 }
227 
RunHw(int task_id)228 int Convolution1x1FP16CPUKernel::RunHw(int task_id) {
229   if (out_tensors_.front()->format() == NC4HW4) {
230     Conv1x1OutNc8hw8MultiThreadByInputFp16(input_ptr_, pack_input_, reinterpret_cast<float16_t *>(packed_weight_),
231                                            reinterpret_cast<float16_t *>(bias_data_), output_ptr_, task_id,
232                                            matmul_param_);
233   } else {
234     int res_stride = matmul_param_->row_ - task_id * thread_stride_;
235     int cur_hw_ = MSMIN(thread_stride_, res_stride);
236     if (cur_hw_ <= 0) {
237       return RET_OK;
238     }
239     float16_t *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
240     float16_t *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
241     float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
242 
243     RowMajor2Col12MajorFp16Opt(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
244 #ifdef ENABLE_ARM64
245     MatMul12x16Fp16Opt(thread_pack_input, reinterpret_cast<float16_t *>(packed_weight_), thread_output_ptr,
246                        reinterpret_cast<float16_t *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_,
247                        cur_hw_, matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
248 #else
249     MatMul12x8A32Fp16(thread_pack_input, reinterpret_cast<float16_t *>(packed_weight_), thread_output_ptr,
250                       reinterpret_cast<float16_t *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_,
251                       cur_hw_, matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
252 #endif
253   }
254   return RET_OK;
255 }
256 
Convolution1x1Fp16RunOc(void * cdata,int task_id,float lhs_scale,float rhs_scale)257 static int Convolution1x1Fp16RunOc(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
258   auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
259   auto error_code = conv->RunOc(task_id);
260   if (error_code != RET_OK) {
261     MS_LOG(ERROR) << "Convolution1x1 Fp16 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
262     return RET_ERROR;
263   }
264   return RET_OK;
265 }
266 
Convolution1x1Fp16RunHw(void * cdata,int task_id,float lhs_scale,float rhs_scale)267 static int Convolution1x1Fp16RunHw(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
268   auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
269   auto error_code = conv->RunHw(task_id);
270   if (error_code != RET_OK) {
271     MS_LOG(ERROR) << "Convolution1x1 Fp16 Run hw error task_id[" << task_id << "] error_code[" << error_code << "]";
272     return RET_ERROR;
273   }
274   return RET_OK;
275 }
276 
Run()277 int Convolution1x1FP16CPUKernel::Run() {
278   auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data());
279   auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data());
280   MS_ASSERT(input_data != nullptr);
281   MS_ASSERT(output_data != nullptr);
282   if (input_data == nullptr || output_data == nullptr) {
283     MS_LOG(ERROR) << "Convolution1x1 Fp16 get null tensor data!";
284     return RET_ERROR;
285   }
286   pack_input_ = reinterpret_cast<float16_t *>(
287     ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float16_t)));
288   if (pack_input_ == nullptr) {
289     MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
290     return RET_MEMORY_FAILED;
291   }
292   if (RepackWeight() != RET_OK) {
293     MS_LOG(ERROR) << "Repack weight failed.";
294     return RET_ERROR;
295   }
296 
297   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
298     output_ptr_ = output_data + batch_index * matmul_param_->row_ * matmul_param_->col_;
299     float16_t *batch_in =
300       input_data + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
301     if (pre_trans_input_) {
302       Conv1x1InputPack(batch_in, input_ptr_, conv_param_, sizeof(float16_t));
303     } else {
304       input_ptr_ = batch_in;
305     }
306 
307     int ret = RET_ERROR;
308     if (multi_thread_by_hw_) {
309       ret = ParallelLaunch(this->ms_context_, Convolution1x1Fp16RunHw, this, thread_count_);
310     } else {
311       if (out_tensors_.front()->format() == NC4HW4) {
312 #ifdef ENABLE_ARM64
313         RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
314 #else
315         RowMajor2Col12MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
316 #endif
317       } else {
318         RowMajor2Col12MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
319       }
320       ret = ParallelLaunch(this->ms_context_, Convolution1x1Fp16RunOc, this, thread_count_);
321     }
322     if (ret != RET_OK) {
323       MS_LOG(ERROR) << "ParallelLaunch failed.";
324       ctx_->allocator->Free(pack_input_);
325       pack_input_ = nullptr;
326       return ret;
327     }
328   }
329   ctx_->allocator->Free(pack_input_);
330   pack_input_ = nullptr;
331   return RET_OK;
332 }
333 }  // namespace mindspore::kernel
334