/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp32/deconvolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/deconvolution_winograd_fp32.h"
#include "src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.h"

using mindspore::kernel::KERNEL_ARCH;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;

namespace mindspore::kernel {
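// Release the MatMulParameter allocated in Init().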
DeConvolutionCPUKernel::~DeConvolutionCPUKernel() {
  if (matmul_param_ != nullptr) {
    delete matmul_param_;
    matmul_param_ = nullptr;
  }
}

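// Validate the tensors, re-run the convolution base initialization and recompute the matmul
// shape parameters; called whenever the input shape changes.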
int DeConvolutionCPUKernel::ReSize() {
  CHECK_LESS_RETURN(in_tensors_.size(), 1);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
  CHECK_NULL_RETURN(conv_param_);
  CHECK_NULL_RETURN(matmul_param_);

  auto ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBaseCPUKernel init error!";
    return ret;
  }

  int error_code = InitParam();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv InitParam error! ret: " << error_code;
    return error_code;
  }
  return RET_OK;
}

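// Allocate aligned buffers for the packed weight (skipped in train sessions, which use the
// workspace instead) and for the bias, with the output channel rounded up to a multiple of C8NUM.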
int DeConvolutionCPUKernel::MallocWeightBiasData() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  auto input_channel = weight_tensor->Batch();
  auto output_channel = weight_tensor->Channel();
  auto kernel_h_ = weight_tensor->Height();
  auto kernel_w_ = weight_tensor->Width();
  int output_aligned_size = UP_ROUND(output_channel, C8NUM);
  size_t pack_weight_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
  if (!op_parameter_->is_train_session_) {
    packed_weight_ = MallocAlignedData(C32NUM, pack_weight_size);
    if (packed_weight_ == nullptr) {
      MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
      return RET_ERROR;
    }
  }

  bias_data_ = MallocAlignedData(C32NUM, output_aligned_size * sizeof(float));
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
    return RET_ERROR;
  }
  memset(bias_data_, 0, output_aligned_size * sizeof(float));
  return RET_OK;
}

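// Repack the NHWC weight tensor into the layout expected by the deconv GEMM:
// CXHWNX order for AVX builds, C8HWN8 otherwise.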
void DeConvolutionCPUKernel::PackWeight() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  auto input_channel = weight_tensor->Batch();
  auto output_channel = weight_tensor->Channel();
  auto kernel_h = weight_tensor->Height();
  auto kernel_w = weight_tensor->Width();
  void *origin_weight = IsTrainable() ? weight_tensor->data() : origin_weight_;
  MS_ASSERT(origin_weight != nullptr);
#ifdef ENABLE_AVX
  PackNHWCToCXHWNXFp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
                       input_channel, kernel_w * kernel_h, output_channel);
#else
  PackNHWCToC8HWN8Fp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
                       input_channel, kernel_w * kernel_h, output_channel);
#endif
}

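// Derive the matmul dimensions from the convolution parameters and split the output channels
// (in blocks of C8NUM) across the available threads.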
int DeConvolutionCPUKernel::InitParam() {
  input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
  kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
  output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;

  matmul_param_->row_ = input_plane_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_ * kernel_plane_;
  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
  matmul_param_->col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * kernel_plane_;

  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
  NNACL_CHECK_ZERO_RETURN_ERR(thread_count_);

#ifdef ENABLE_AVX
  thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM * C3NUM), thread_count_) * C3NUM;
#else
  thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
#endif
  return RET_OK;
}

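// ParallelLaunch callback: forwards one task slice to DoDeconv on the kernel instance.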
int DeConvFp32Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  auto deconv = reinterpret_cast<DeConvolutionCPUKernel *>(cdata);
  auto error_code = deconv->DoDeconv(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "DeConvFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

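// Compute this task's slice of output channels: a GEMM of the packed input against the packed
// weight into tmp_buffer_, followed by DeConvPostFp32C8, which adds the bias and accumulates the
// column buffer back into the spatial output.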
int DeConvolutionCPUKernel::DoDeconv(int task_id) {
  int res_stride = UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_;
  int oc = MSMIN(thread_stride_, res_stride);
  int cur_stride = thread_stride_ * C8NUM;
  res_stride = conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM;
  int oc_res = MSMIN(cur_stride, res_stride);
  if (oc <= 0 || oc_res <= 0) {
    return RET_OK;
  }
  auto tmp_buffer = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_align_;
#ifdef ENABLE_AVX
  DeconvMatmulAvx(
    pack_input_,
    reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
    tmp_buffer, matmul_param_->deep_, matmul_param_->row_align_, oc * C8NUM * kernel_plane_, kernel_plane_);
#elif ENABLE_SSE
  DeconvMatmulFloatSse(
    pack_input_,
    reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
    tmp_buffer, matmul_param_->deep_, matmul_param_->row_align_, oc * C8NUM * kernel_plane_);
#else
  MatMulOpt(
    pack_input_,
    reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
    tmp_buffer, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_align_, oc * C8NUM * kernel_plane_,
    matmul_param_->col_, OutType_C8);
#endif

  DeConvPostFp32C8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                   reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id * C8NUM,
                   output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
  return RET_OK;
}

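// One-time setup: choose the row tile for the target ISA, reserve workspace for train sessions,
// allocate the MatMulParameter and, when the weight data is already present, pack weight and bias.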
int DeConvolutionCPUKernel::Init() {
  CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
  CHECK_NULL_RETURN(conv_param_);
  CHECK_NULL_RETURN(in_tensors_.at(kInputIndex));
  CHECK_NULL_RETURN(in_tensors_.at(kWeightIndex));

#if defined(ENABLE_ARM32) || defined(ENABLE_AVX) || defined(ENABLE_SSE)
  row_tile_ = C4NUM;
#else
  row_tile_ = C12NUM;
#endif
  if (op_parameter_->is_train_session_) {
    auto weight_tensor = in_tensors_.at(kWeightIndex);
    auto input_channel = weight_tensor->Batch();
    auto output_channel = weight_tensor->Channel();
    auto kernel_h_ = weight_tensor->Height();
    auto kernel_w_ = weight_tensor->Width();
    int output_aligned_size = UP_ROUND(output_channel, C8NUM);
    size_t pack_weight_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
    set_workspace_size(pack_weight_size);
  }
  matmul_param_ = new (std::nothrow) MatMulParameter();
  if (matmul_param_ == nullptr) {
    MS_LOG(ERROR) << "Memory allocation failed";
    return RET_ERROR;
  }
  if (in_tensors_.at(kWeightIndex)->data() != nullptr) {
    int error_code = InitConvWeightBias();
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv InitConvWeightBias error! ret: " << error_code;
      return error_code;
    }
  } else {
    is_repack_ = true;
    MS_LOG(WARNING) << "The weight is nullptr, will pack in runtime.";
  }
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

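// Return the per-run buffers to the context allocator.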
void DeConvolutionCPUKernel::FreeRunBuf() {
  if (pack_output_ != nullptr) {
    ctx_->allocator->Free(pack_output_);
    pack_output_ = nullptr;
  }
  if (tmp_buffer_ != nullptr) {
    ctx_->allocator->Free(tmp_buffer_);
    tmp_buffer_ = nullptr;
  }
  if (pack_input_ != nullptr) {
    ctx_->allocator->Free(pack_input_);
    pack_input_ = nullptr;
  }
}

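// Allocate the per-run buffers: the C8-packed output, the GEMM temporary buffer and the packed input.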
int DeConvolutionCPUKernel::InitRunBuf() {
  pack_output_ = reinterpret_cast<float *>(
    ctx_->allocator->Malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float)));
  if (pack_output_ == nullptr) {
    MS_LOG(ERROR) << "deconv Malloc pack_output_ error!";
    return RET_NULL_PTR;
  }

  tmp_buffer_ = reinterpret_cast<float *>(
    ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->col_8_ * sizeof(float)));
  if (tmp_buffer_ == nullptr) {
    MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!";
    return RET_NULL_PTR;
  }

  pack_input_ = reinterpret_cast<float *>(
    ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float)));
  if (pack_input_ == nullptr) {
    MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
    return RET_ERROR;
  }
  return RET_OK;
}

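// Repack the weight if needed, then for each batch pack the input into column-major tiles
// (4- or 12-row, depending on the ISA) and launch DeConvFp32Run across thread_count_ workers.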
int DeConvolutionCPUKernel::Run() {
  if (RepackWeight() != RET_OK) {
    MS_LOG(ERROR) << "Repack weight failed.";
    return RET_ERROR;
  }

  auto input_tensor = in_tensors_.at(kInputIndex);
  auto output_tensor = out_tensors_.at(kOutputIndex);
  float *src_in = reinterpret_cast<float *>(input_tensor->data());
  float *src_out = reinterpret_cast<float *>(output_tensor->data());
  CHECK_NULL_RETURN(src_in);
  CHECK_NULL_RETURN(src_out);

  int error_code = InitRunBuf();
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "deconv fp32 InitRunBuf error! error_code[" << error_code << "]";
    FreeRunBuf();
    return error_code;
  }

  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
    input_ptr_ = src_in + batch_index * input_plane_ * conv_param_->input_channel_;
    output_ptr_ = src_out + batch_index * output_plane_ * conv_param_->output_channel_;

#if defined(ENABLE_ARM32) || defined(ENABLE_SSE)
    RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
#else
    RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
#endif

    error_code = ParallelLaunch(this->ms_context_, DeConvFp32Run, this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]";
      FreeRunBuf();
      return error_code;
    }
  }

  FreeRunBuf();
  return RET_OK;
}

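// Select the concrete deconv kernel: Winograd for strided, non-dilated cases (with extra size
// heuristics under AVX), depthwise when group == input channels == output channels, and the
// generic GEMM-based kernel otherwise. Grouped deconvolution is not supported.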
kernel::InnerKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_CHECK_TRUE_RET(op_parameter != nullptr, nullptr);
  MS_CHECK_TRUE_RET(ctx != nullptr, nullptr);

  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);

  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  kernel::InnerKernel *kernel = nullptr;
  if (conv_param->group_ == 1) {
#ifdef ENABLE_AVX
    if ((conv_param->stride_h_ > 1 || conv_param->stride_w_ > 1) &&
        (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1) &&
        (conv_param->kernel_w_ / conv_param->stride_w_ >= 2 || conv_param->kernel_h_ / conv_param->stride_h_ >= 2 ||
         conv_param->output_channel_ == 1) &&
        conv_param->input_w_ * conv_param->input_h_ >= 2000) {
      // output_channel_ = 1 is not appropriate for the gemm deconv on x86
      kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(op_parameter, inputs, outputs,
                                                                         static_cast<const lite::InnerContext *>(ctx));
    } else {
      kernel = new (std::nothrow)
        kernel::DeConvolutionCPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
    }
#else
    if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
        (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
      kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(op_parameter, inputs, outputs,
                                                                         static_cast<const lite::InnerContext *>(ctx));
    } else {
      kernel = new (std::nothrow)
        kernel::DeConvolutionCPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
    }
#endif
  } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
    kernel = new (std::nothrow) kernel::DeconvolutionDepthwiseCPUKernel(op_parameter, inputs, outputs,
                                                                        static_cast<const lite::InnerContext *>(ctx));
  } else {
    MS_LOG(ERROR) << "deconv does not support group deconv!";
    kernel = nullptr;
  }

  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    free(op_parameter);
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2dTransposeFusion, CpuDeConvFp32KernelCreator)
}  // namespace mindspore::kernel