• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/runtime/kernel/arm/fp16/deconvolution_fp16.h"
18 #include "src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h"
19 #include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h"
20 
21 using mindspore::kernel::KERNEL_ARCH;
22 using mindspore::lite::KernelRegistrar;
23 using mindspore::lite::RET_ERROR;
24 using mindspore::lite::RET_NULL_PTR;
25 using mindspore::lite::RET_OK;
26 using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
27 
28 namespace mindspore::kernel {
~DeConvolutionFp16CPUKernel()29 DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
30   if (matmul_param_ != nullptr) {
31     delete matmul_param_;
32     matmul_param_ = nullptr;
33   }
34   return;
35 }
36 
ReSize()37 int DeConvolutionFp16CPUKernel::ReSize() {
38   CHECK_LESS_RETURN(in_tensors_.size(), 1);
39   CHECK_LESS_RETURN(out_tensors_.size(), 1);
40   CHECK_NULL_RETURN(conv_param_);
41   CHECK_NULL_RETURN(matmul_param_);
42 
43   auto ret = ConvolutionBaseCPUKernel::Init();
44   if (ret != RET_OK) {
45     MS_LOG(ERROR) << "ConvolutionBaseCPUKernel Init error!";
46     return ret;
47   }
48   int error_code = InitParam();
49   if (error_code != RET_OK) {
50     MS_LOG(ERROR) << "deconv InitParam error!";
51     return error_code;
52   }
53   return RET_OK;
54 }
55 
PackWeight()56 void DeConvolutionFp16CPUKernel::PackWeight() {
57   auto weight_tensor = in_tensors_.at(kWeightIndex);
58   auto input_channel = weight_tensor->Batch();
59   auto output_channel = weight_tensor->Channel();
60   auto kernel_h = weight_tensor->Height();
61   auto kernel_w = weight_tensor->Width();
62   void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data() : origin_weight_;
63   MS_ASSERT(origin_weight != nullptr);
64   PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_),
65                            input_channel, kernel_w * kernel_h, output_channel);
66 }
67 
MallocWeightBiasData()68 int DeConvolutionFp16CPUKernel::MallocWeightBiasData() {
69   auto weight_tensor = in_tensors_.at(kWeightIndex);
70   auto input_channel = weight_tensor->Batch();
71   auto output_channel = weight_tensor->Channel();
72   auto kernel_h = weight_tensor->Height();
73   auto kernel_w = weight_tensor->Width();
74   size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
75   if (!op_parameter_->is_train_session_) {
76     packed_weight_ = malloc(weight_pack_size);
77     if (packed_weight_ == nullptr) {
78       MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
79       return RET_ERROR;
80     }
81     memset(packed_weight_, 0, weight_pack_size);
82   }
83   auto bias_size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
84   bias_data_ = malloc(bias_size);
85   if (bias_data_ == nullptr) {
86     MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
87     return RET_ERROR;
88   }
89   memset(bias_data_, 0, UP_ROUND(output_channel, C8NUM) * sizeof(float16_t));
90   return RET_OK;
91 }
92 
InitParam()93 int DeConvolutionFp16CPUKernel::InitParam() {
94   input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
95   kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
96   output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;
97 
98   matmul_param_->row_ = input_plane_;
99   matmul_param_->deep_ = conv_param_->input_channel_;
100   matmul_param_->col_ = conv_param_->output_channel_ * kernel_plane_;
101   matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM);
102   matmul_param_->col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * kernel_plane_;
103 
104   thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
105   NNACL_CHECK_ZERO_RETURN_ERR(thread_count_);
106   thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
107   return RET_OK;
108 }
109 
InitRunBuf()110 int DeConvolutionFp16CPUKernel::InitRunBuf() {
111   pack_output_ = reinterpret_cast<float16_t *>(
112     ctx_->allocator->Malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float16_t)));
113   if (pack_output_ == nullptr) {
114     MS_LOG(ERROR) << "deconv Malloc pack_output_ error!";
115     return RET_NULL_PTR;
116   }
117 
118   tmp_buffer_ = reinterpret_cast<float16_t *>(
119     ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->col_8_ * sizeof(float16_t)));
120   if (tmp_buffer_ == nullptr) {
121     MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!";
122     return RET_ERROR;
123   }
124 
125   pack_input_ =
126     reinterpret_cast<float16_t *>(malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
127   if (pack_input_ == nullptr) {
128     MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
129     return RET_ERROR;
130   }
131   return RET_OK;
132 }
133 
FreeRunBuf()134 void DeConvolutionFp16CPUKernel::FreeRunBuf() {
135   if (tmp_buffer_ != nullptr) {
136     ctx_->allocator->Free(tmp_buffer_);
137     tmp_buffer_ = nullptr;
138   }
139   if (pack_output_ != nullptr) {
140     ctx_->allocator->Free(pack_output_);
141     pack_output_ = nullptr;
142   }
143   if (pack_input_ != nullptr) {
144     ctx_->allocator->Free(pack_input_);
145     pack_input_ = nullptr;
146   }
147   return;
148 }
149 
DeConvFp16Run(void * cdata,int task_id,float lhs_scale,float rhs_scale)150 static int DeConvFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
151   auto deconv = reinterpret_cast<DeConvolutionFp16CPUKernel *>(cdata);
152   auto error_code = deconv->DoDeconv(task_id);
153   if (error_code != RET_OK) {
154     MS_LOG(ERROR) << "DeConvFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
155     return RET_ERROR;
156   }
157   return RET_OK;
158 }
159 
DoDeconv(int task_id)160 int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
161   int cur_stride = UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_;
162   int oc = MSMIN(thread_stride_, cur_stride);
163   cur_stride = conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM;
164   int oc_res = MSMIN(thread_stride_ * C8NUM, cur_stride);
165   if (oc <= 0) {
166     return RET_OK;
167   }
168 
169   auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
170   MatMulFp16(pack_input_,
171              reinterpret_cast<float16_t *>(packed_weight_) +
172                task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
173              tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
174              OutType_C8);
175 
176   DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
177                  reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
178                  batch_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
179   return RET_OK;
180 }
181 
Init()182 int DeConvolutionFp16CPUKernel::Init() {
183   CHECK_LESS_RETURN(in_tensors_.size(), 2);
184   CHECK_LESS_RETURN(out_tensors_.size(), 1);
185   CHECK_NULL_RETURN(conv_param_);
186   CHECK_NULL_RETURN(in_tensors_.at(kInputIndex));
187   CHECK_NULL_RETURN(in_tensors_.at(kWeightIndex));
188   UpdateOriginWeightAndBias();
189 
190   if (op_parameter_->is_train_session_) {
191     auto weight_tensor = in_tensors_.at(kWeightIndex);
192     auto input_channel = weight_tensor->Batch();
193     auto output_channel = weight_tensor->Channel();
194     auto kernel_h = weight_tensor->Height();
195     auto kernel_w = weight_tensor->Width();
196     size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
197     set_workspace_size(weight_pack_size);
198   }
199   matmul_param_ = new (std::nothrow) MatMulParameter();
200   if (matmul_param_ == nullptr) {
201     MS_LOG(ERROR) << "Memory allocation failed";
202     return RET_ERROR;
203   }
204   int ret = InitConvWeightBias();
205   if (ret != RET_OK) {
206     MS_LOG(ERROR) << "deconv InitConvWeightBias error!";
207     return ret;
208   }
209   if (!InferShapeDone()) {
210     return RET_OK;
211   }
212   return ReSize();
213 }
214 
Run()215 int DeConvolutionFp16CPUKernel::Run() {
216   if (RepackWeight() != RET_OK) {
217     MS_LOG(ERROR) << "Repack weight failed.";
218     return RET_ERROR;
219   }
220 
221   auto input_tensor = in_tensors_.at(kInputIndex);
222   auto output_tensor = out_tensors_.at(kOutputIndex);
223   auto *input_ptr = reinterpret_cast<float16_t *>(input_tensor->data());
224   auto *output_ptr = reinterpret_cast<float16_t *>(output_tensor->data());
225   CHECK_NULL_RETURN(input_ptr);
226   CHECK_NULL_RETURN(output_ptr);
227 
228   int error_code = InitRunBuf();
229   if (error_code != RET_OK) {
230     MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]";
231     FreeRunBuf();
232     return RET_ERROR;
233   }
234 
235   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
236     batch_input_ = input_ptr + batch_index * conv_param_->input_channel_ * input_plane_;
237     batch_output_ = output_ptr + batch_index * conv_param_->output_channel_ * output_plane_;
238 
239     RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);
240 
241     error_code = ParallelLaunch(this->ms_context_, DeConvFp16Run, this, thread_count_);
242     if (error_code != RET_OK) {
243       MS_LOG(ERROR) << "deconv fp16 run error! error_code[" << error_code << "]";
244       FreeRunBuf();
245       return error_code;
246     }
247   }
248 
249   FreeRunBuf();
250   return error_code;
251 }
252 
CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,OpParameter * op_parameter,const lite::Context * ctx,const kernel::KernelKey & desc)253 kernel::InnerKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
254                                                 const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
255                                                 const lite::Context *ctx, const kernel::KernelKey &desc) {
256   MS_CHECK_TRUE_RET(op_parameter != nullptr, nullptr);
257   MS_CHECK_TRUE_RET(ctx != nullptr, nullptr);
258   MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);
259 
260   kernel::InnerKernel *kernel = nullptr;
261   auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
262   if (conv_param->group_ == 1) {
263     if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
264         (conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1)) {
265       kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs,
266                                                                       static_cast<const lite::InnerContext *>(ctx));
267     } else {
268       kernel = new (std::nothrow)
269         kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
270     }
271   } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
272     kernel = new (std::nothrow)
273       DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
274   }
275 
276   if (kernel == nullptr) {
277     MS_LOG(ERROR) << "kernel is nullptr.";
278     free(op_parameter);
279     return nullptr;
280   }
281   return kernel;
282 }
283 
// NOTE(review): presumably registers CpuDeConvFp16KernelCreator as the creator used for
// Conv2dTransposeFusion with float16 data on the CPU backend. REG_KERNEL is defined outside
// this file, so confirm against its definition.
284 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2dTransposeFusion, CpuDeConvFp16KernelCreator)
285 }  // namespace mindspore::kernel
286