1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/runtime/kernel/arm/fp32/deconvolution_fp32.h"
18 #include "src/runtime/kernel/arm/fp32/deconvolution_winograd_fp32.h"
19 #include "src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.h"
20
21 using mindspore::kernel::KERNEL_ARCH;
22 using mindspore::lite::KernelRegistrar;
23 using mindspore::lite::RET_ERROR;
24 using mindspore::lite::RET_NULL_PTR;
25 using mindspore::lite::RET_OK;
26 using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
27
28 namespace mindspore::kernel {
~DeConvolutionCPUKernel()29 DeConvolutionCPUKernel::~DeConvolutionCPUKernel() {
30 if (matmul_param_ != nullptr) {
31 delete matmul_param_;
32 matmul_param_ = nullptr;
33 }
34 }
35
ReSize()36 int DeConvolutionCPUKernel::ReSize() {
37 CHECK_LESS_RETURN(in_tensors_.size(), 1);
38 CHECK_LESS_RETURN(out_tensors_.size(), 1);
39 CHECK_NULL_RETURN(conv_param_);
40 CHECK_NULL_RETURN(matmul_param_);
41
42 auto ret = ConvolutionBaseCPUKernel::Init();
43 if (ret != RET_OK) {
44 MS_LOG(ERROR) << "ConvolutionBaseCPUKernel init error!";
45 return ret;
46 }
47
48 int error_code = InitParam();
49 if (error_code != RET_OK) {
50 MS_LOG(ERROR) << "deconv InitParam error!ret: " << error_code;
51 return error_code;
52 }
53 return RET_OK;
54 }
55
MallocWeightBiasData()56 int DeConvolutionCPUKernel::MallocWeightBiasData() {
57 auto weight_tensor = in_tensors_.at(kWeightIndex);
58 auto input_channel = weight_tensor->Batch();
59 auto output_channel = weight_tensor->Channel();
60 auto kernel_h_ = weight_tensor->Height();
61 auto kernel_w_ = weight_tensor->Width();
62 int output_aligned_size = UP_ROUND(output_channel, C8NUM);
63 size_t pack_weight_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
64 if (!op_parameter_->is_train_session_) {
65 packed_weight_ = MallocAlignedData(C32NUM, pack_weight_size);
66 if (packed_weight_ == nullptr) {
67 MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
68 return RET_ERROR;
69 }
70 }
71
72 bias_data_ = MallocAlignedData(C32NUM, output_aligned_size * sizeof(float));
73 if (bias_data_ == nullptr) {
74 MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
75 return RET_ERROR;
76 }
77 memset(bias_data_, 0, output_aligned_size * sizeof(float));
78 return RET_OK;
79 }
80
PackWeight()81 void DeConvolutionCPUKernel::PackWeight() {
82 auto weight_tensor = in_tensors_.at(kWeightIndex);
83 auto input_channel = weight_tensor->Batch();
84 auto output_channel = weight_tensor->Channel();
85 auto kernel_h = weight_tensor->Height();
86 auto kernel_w = weight_tensor->Width();
87 void *origin_weight = IsTrainable() ? weight_tensor->data() : origin_weight_;
88 MS_ASSERT(origin_weight != nullptr);
89 #ifdef ENABLE_AVX
90 PackNHWCToCXHWNXFp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
91 input_channel, kernel_w * kernel_h, output_channel);
92 #else
93 PackNHWCToC8HWN8Fp32(reinterpret_cast<float *>(origin_weight), reinterpret_cast<float *>(packed_weight_),
94 input_channel, kernel_w * kernel_h, output_channel);
95 #endif
96 }
97
// Derives the matmul shape and the per-thread work split from the conv
// attributes. Called from ReSize(), so it only assigns (no allocation).
int DeConvolutionCPUKernel::InitParam() {
  // Spatial plane sizes (H * W) for input, kernel and output.
  input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
  kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
  output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;

  // Deconv as GEMM: [input_plane x in_channel] * [in_channel x (oc * kernel_plane)].
  matmul_param_->row_ = input_plane_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_ * kernel_plane_;
  // Rows padded to the tile chosen in Init() (C4NUM or C12NUM).
  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
  // Columns padded so each output-channel group is a full C8 block.
  matmul_param_->col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * kernel_plane_;

  // At most one thread per C8 output-channel group.
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
  NNACL_CHECK_ZERO_RETURN_ERR(thread_count_);

#ifdef ENABLE_AVX
  // thread_stride_ is in units of C8 groups; the AVX path keeps it a
  // multiple of C3NUM groups (presumably DeconvMatmulAvx processes three
  // C8 groups per step — confirm against the AVX kernel).
  thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM * C3NUM), thread_count_) * C3NUM;
#else
  thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
#endif
  return RET_OK;
}
119
DeConvFp32Run(void * cdata,int task_id,float lhs_scale,float rhs_scale)120 int DeConvFp32Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
121 auto deconv = reinterpret_cast<DeConvolutionCPUKernel *>(cdata);
122 auto error_code = deconv->DoDeconv(task_id);
123 if (error_code != RET_OK) {
124 MS_LOG(ERROR) << "DeConvFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
125 return RET_ERROR;
126 }
127 return RET_OK;
128 }
129
// Computes one task's slice of the deconvolution: a GEMM over this task's
// output-channel groups followed by the C8 post step into the output map.
int DeConvolutionCPUKernel::DoDeconv(int task_id) {
  // C8 output-channel groups remaining from this task's start offset.
  int res_stride = UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_;
  // oc: C8 groups this task computes; oc_res: real (unpadded) channel count
  // handed to the post step.
  int oc = MSMIN(thread_stride_, res_stride);
  int cur_stride = thread_stride_ * C8NUM;
  res_stride = conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM;
  int oc_res = MSMIN(cur_stride, res_stride);
  if (oc <= 0 || oc_res <= 0) {
    // Nothing assigned to this task (more threads than channel groups).
    return RET_OK;
  }
  // Per-task window into the shared GEMM scratch buffer.
  auto tmp_buffer = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_align_;
#ifdef ENABLE_AVX
  DeconvMatmulAvx(
    pack_input_,
    reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
    tmp_buffer, matmul_param_->deep_, matmul_param_->row_align_, oc * C8NUM * kernel_plane_, kernel_plane_);
#elif ENABLE_SSE
  DeconvMatmulFloatSse(
    pack_input_,
    reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
    tmp_buffer, matmul_param_->deep_, matmul_param_->row_align_, oc * C8NUM * kernel_plane_);
#else
  MatMulOpt(
    pack_input_,
    reinterpret_cast<float *>(packed_weight_) + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
    tmp_buffer, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_align_, oc * C8NUM * kernel_plane_,
    matmul_param_->col_, OutType_C8);
#endif

  // Post step: adds bias and scatters the GEMM result into this task's
  // channel slice of the output feature map.
  DeConvPostFp32C8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                   reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id * C8NUM,
                   output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
  return RET_OK;
}
163
Init()164 int DeConvolutionCPUKernel::Init() {
165 CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
166 CHECK_LESS_RETURN(out_tensors_.size(), 1);
167 CHECK_NULL_RETURN(conv_param_);
168 CHECK_NULL_RETURN(in_tensors_.at(kInputIndex));
169 CHECK_NULL_RETURN(in_tensors_.at(kWeightIndex));
170
171 #if defined(ENABLE_ARM32) || defined(ENABLE_AVX) || defined(ENABLE_SSE)
172 row_tile_ = C4NUM;
173 #else
174 row_tile_ = C12NUM;
175 #endif
176 if (op_parameter_->is_train_session_) {
177 auto weight_tensor = in_tensors_.at(kWeightIndex);
178 auto input_channel = weight_tensor->Batch();
179 auto output_channel = weight_tensor->Channel();
180 auto kernel_h_ = weight_tensor->Height();
181 auto kernel_w_ = weight_tensor->Width();
182 int output_aligned_size = UP_ROUND(output_channel, C8NUM);
183 size_t pack_weight_size = input_channel * kernel_w_ * kernel_h_ * output_aligned_size * sizeof(float);
184 set_workspace_size(pack_weight_size);
185 }
186 matmul_param_ = new (std::nothrow) MatMulParameter();
187 if (matmul_param_ == nullptr) {
188 MS_LOG(ERROR) << "Memory allocation failed";
189 return RET_ERROR;
190 }
191 if (in_tensors_.at(kWeightIndex)->data() != nullptr) {
192 int error_code = InitConvWeightBias();
193 if (error_code != RET_OK) {
194 MS_LOG(ERROR) << "deconv InitConvWeightBias error!ret: " << error_code;
195 return error_code;
196 }
197 } else {
198 is_repack_ = true;
199 MS_LOG(WARNING) << "The weight is nullptr, will pack in runtime.";
200 }
201 if (!InferShapeDone()) {
202 return RET_OK;
203 }
204 return ReSize();
205 }
206
FreeRunBuf()207 void DeConvolutionCPUKernel::FreeRunBuf() {
208 if (pack_output_ != nullptr) {
209 ctx_->allocator->Free(pack_output_);
210 pack_output_ = nullptr;
211 }
212 if (tmp_buffer_ != nullptr) {
213 ctx_->allocator->Free(tmp_buffer_);
214 tmp_buffer_ = nullptr;
215 }
216 if (pack_input_ != nullptr) {
217 ctx_->allocator->Free(pack_input_);
218 pack_input_ = nullptr;
219 }
220 }
221
InitRunBuf()222 int DeConvolutionCPUKernel::InitRunBuf() {
223 pack_output_ = reinterpret_cast<float *>(
224 ctx_->allocator->Malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float)));
225 if (pack_output_ == nullptr) {
226 MS_LOG(ERROR) << "deconv Malloc pack_output_ error!";
227 return RET_NULL_PTR;
228 }
229
230 tmp_buffer_ = reinterpret_cast<float *>(
231 ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->col_8_ * sizeof(float)));
232 if (tmp_buffer_ == nullptr) {
233 MS_LOG(ERROR) << "Conv1x1 Malloc tmp_buffer_ error!";
234 return RET_NULL_PTR;
235 }
236
237 pack_input_ = reinterpret_cast<float *>(
238 ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float)));
239 if (pack_input_ == nullptr) {
240 MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
241 return RET_ERROR;
242 }
243 return RET_OK;
244 }
245
Run()246 int DeConvolutionCPUKernel::Run() {
247 if (RepackWeight() != RET_OK) {
248 MS_LOG(ERROR) << "Repack weight failed.";
249 return RET_ERROR;
250 }
251
252 auto input_tensor = in_tensors_.at(kInputIndex);
253 auto output_tensor = out_tensors_.at(kOutputIndex);
254 float *src_in = reinterpret_cast<float *>(input_tensor->data());
255 float *src_out = reinterpret_cast<float *>(output_tensor->data());
256 CHECK_NULL_RETURN(src_in);
257 CHECK_NULL_RETURN(src_out);
258
259 int error_code = InitRunBuf();
260 if (error_code != RET_OK) {
261 MS_LOG(ERROR) << "deconv fp32 InitRunBuf error! error_code[" << error_code << "]";
262 FreeRunBuf();
263 return error_code;
264 }
265
266 for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
267 input_ptr_ = src_in + batch_index * input_plane_ * conv_param_->input_channel_;
268 output_ptr_ = src_out + batch_index * output_plane_ * conv_param_->output_channel_;
269
270 #if defined(ENABLE_ARM32) || defined(ENABLE_SSE)
271 RowMajor2Col4Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
272 #else
273 RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
274 #endif
275
276 error_code = ParallelLaunch(this->ms_context_, DeConvFp32Run, this, thread_count_);
277 if (error_code != RET_OK) {
278 MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]";
279 FreeRunBuf();
280 return error_code;
281 }
282 }
283
284 FreeRunBuf();
285 return RET_OK;
286 }
287
CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,OpParameter * op_parameter,const lite::Context * ctx,const kernel::KernelKey & desc)288 kernel::InnerKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
289 const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
290 const lite::Context *ctx, const kernel::KernelKey &desc) {
291 MS_CHECK_TRUE_RET(op_parameter != nullptr, nullptr);
292 MS_CHECK_TRUE_RET(ctx != nullptr, nullptr);
293
294 MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);
295
296 auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
297 kernel::InnerKernel *kernel = nullptr;
298 if (conv_param->group_ == 1) {
299 #ifdef ENABLE_AVX
300 if ((conv_param->stride_h_ > 1 || conv_param->stride_w_ > 1) &&
301 (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1) &&
302 (conv_param->kernel_w_ / conv_param->stride_w_ >= 2 || conv_param->kernel_h_ / conv_param->stride_h_ >= 2 ||
303 conv_param->output_channel_ == 1) &&
304 conv_param->input_w_ * conv_param->input_h_ >= 2000) {
305 // output_channel_ = 1 is not appropriate in gemm deconv in x86
306 kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(op_parameter, inputs, outputs,
307 static_cast<const lite::InnerContext *>(ctx));
308 } else {
309 kernel = new (std::nothrow)
310 kernel::DeConvolutionCPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
311 }
312 #else
313 if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
314 (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
315 kernel = new (std::nothrow) kernel::DeConvolutionWinogradCPUKernel(op_parameter, inputs, outputs,
316 static_cast<const lite::InnerContext *>(ctx));
317 } else {
318 kernel = new (std::nothrow)
319 kernel::DeConvolutionCPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
320 }
321 #endif
322 } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
323 kernel = new (std::nothrow) kernel::DeconvolutionDepthwiseCPUKernel(op_parameter, inputs, outputs,
324 static_cast<const lite::InnerContext *>(ctx));
325 } else {
326 MS_LOG(ERROR) << "deconv do not support group deconv!";
327 kernel = nullptr;
328 }
329
330 if (kernel == nullptr) {
331 MS_LOG(ERROR) << "kernel is nullptr.";
332 free(op_parameter);
333 return nullptr;
334 }
335 return kernel;
336 }
337
338 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2dTransposeFusion, CpuDeConvFp32KernelCreator)
339 } // namespace mindspore::kernel
340