1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/runtime/kernel/arm/fp16/deconvolution_fp16.h"
18 #include "src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h"
19 #include "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h"
20
21 using mindspore::kernel::KERNEL_ARCH;
22 using mindspore::lite::KernelRegistrar;
23 using mindspore::lite::RET_ERROR;
24 using mindspore::lite::RET_NULL_PTR;
25 using mindspore::lite::RET_OK;
26 using mindspore::schema::PrimitiveType_Conv2dTransposeFusion;
27
28 namespace mindspore::kernel {
~DeConvolutionFp16CPUKernel()29 DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
30 if (matmul_param_ != nullptr) {
31 delete matmul_param_;
32 matmul_param_ = nullptr;
33 }
34 return;
35 }
36
ReSize()37 int DeConvolutionFp16CPUKernel::ReSize() {
38 CHECK_LESS_RETURN(in_tensors_.size(), 1);
39 CHECK_LESS_RETURN(out_tensors_.size(), 1);
40 CHECK_NULL_RETURN(conv_param_);
41 CHECK_NULL_RETURN(matmul_param_);
42
43 auto ret = ConvolutionBaseCPUKernel::Init();
44 if (ret != RET_OK) {
45 MS_LOG(ERROR) << "ConvolutionBaseCPUKernel Init error!";
46 return ret;
47 }
48 int error_code = InitParam();
49 if (error_code != RET_OK) {
50 MS_LOG(ERROR) << "deconv InitParam error!";
51 return error_code;
52 }
53 return RET_OK;
54 }
55
PackWeight()56 void DeConvolutionFp16CPUKernel::PackWeight() {
57 auto weight_tensor = in_tensors_.at(kWeightIndex);
58 auto input_channel = weight_tensor->Batch();
59 auto output_channel = weight_tensor->Channel();
60 auto kernel_h = weight_tensor->Height();
61 auto kernel_w = weight_tensor->Width();
62 void *origin_weight = (op_parameter_->is_train_session_) ? weight_tensor->data() : origin_weight_;
63 MS_ASSERT(origin_weight != nullptr);
64 PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(origin_weight), reinterpret_cast<float16_t *>(packed_weight_),
65 input_channel, kernel_w * kernel_h, output_channel);
66 }
67
MallocWeightBiasData()68 int DeConvolutionFp16CPUKernel::MallocWeightBiasData() {
69 auto weight_tensor = in_tensors_.at(kWeightIndex);
70 auto input_channel = weight_tensor->Batch();
71 auto output_channel = weight_tensor->Channel();
72 auto kernel_h = weight_tensor->Height();
73 auto kernel_w = weight_tensor->Width();
74 size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
75 if (!op_parameter_->is_train_session_) {
76 packed_weight_ = malloc(weight_pack_size);
77 if (packed_weight_ == nullptr) {
78 MS_LOG(ERROR) << "deconv malloc packed_weight_ error!";
79 return RET_ERROR;
80 }
81 memset(packed_weight_, 0, weight_pack_size);
82 }
83 auto bias_size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
84 bias_data_ = malloc(bias_size);
85 if (bias_data_ == nullptr) {
86 MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
87 return RET_ERROR;
88 }
89 memset(bias_data_, 0, UP_ROUND(output_channel, C8NUM) * sizeof(float16_t));
90 return RET_OK;
91 }
92
InitParam()93 int DeConvolutionFp16CPUKernel::InitParam() {
94 input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
95 kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
96 output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;
97
98 matmul_param_->row_ = input_plane_;
99 matmul_param_->deep_ = conv_param_->input_channel_;
100 matmul_param_->col_ = conv_param_->output_channel_ * kernel_plane_;
101 matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM);
102 matmul_param_->col_8_ = UP_ROUND(conv_param_->output_channel_, C8NUM) * kernel_plane_;
103
104 thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(conv_param_->output_channel_, C8NUM));
105 NNACL_CHECK_ZERO_RETURN_ERR(thread_count_);
106 thread_stride_ = UP_DIV(UP_DIV(conv_param_->output_channel_, C8NUM), thread_count_);
107 return RET_OK;
108 }
109
InitRunBuf()110 int DeConvolutionFp16CPUKernel::InitRunBuf() {
111 pack_output_ = reinterpret_cast<float16_t *>(
112 ctx_->allocator->Malloc(UP_ROUND(conv_param_->output_channel_, C8NUM) * output_plane_ * sizeof(float16_t)));
113 if (pack_output_ == nullptr) {
114 MS_LOG(ERROR) << "deconv Malloc pack_output_ error!";
115 return RET_NULL_PTR;
116 }
117
118 tmp_buffer_ = reinterpret_cast<float16_t *>(
119 ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->col_8_ * sizeof(float16_t)));
120 if (tmp_buffer_ == nullptr) {
121 MS_LOG(ERROR) << "deconv Malloc tmp_buffer_ error!";
122 return RET_ERROR;
123 }
124
125 pack_input_ =
126 reinterpret_cast<float16_t *>(malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t)));
127 if (pack_input_ == nullptr) {
128 MS_LOG(ERROR) << "deconv Malloc pack_input_ error!";
129 return RET_ERROR;
130 }
131 return RET_OK;
132 }
133
FreeRunBuf()134 void DeConvolutionFp16CPUKernel::FreeRunBuf() {
135 if (tmp_buffer_ != nullptr) {
136 ctx_->allocator->Free(tmp_buffer_);
137 tmp_buffer_ = nullptr;
138 }
139 if (pack_output_ != nullptr) {
140 ctx_->allocator->Free(pack_output_);
141 pack_output_ = nullptr;
142 }
143 if (pack_input_ != nullptr) {
144 ctx_->allocator->Free(pack_input_);
145 pack_input_ = nullptr;
146 }
147 return;
148 }
149
DeConvFp16Run(void * cdata,int task_id,float lhs_scale,float rhs_scale)150 static int DeConvFp16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
151 auto deconv = reinterpret_cast<DeConvolutionFp16CPUKernel *>(cdata);
152 auto error_code = deconv->DoDeconv(task_id);
153 if (error_code != RET_OK) {
154 MS_LOG(ERROR) << "DeConvFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
155 return RET_ERROR;
156 }
157 return RET_OK;
158 }
159
DoDeconv(int task_id)160 int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
161 int cur_stride = UP_DIV(conv_param_->output_channel_, C8NUM) - task_id * thread_stride_;
162 int oc = MSMIN(thread_stride_, cur_stride);
163 cur_stride = conv_param_->output_channel_ - task_id * thread_stride_ * C8NUM;
164 int oc_res = MSMIN(thread_stride_ * C8NUM, cur_stride);
165 if (oc <= 0) {
166 return RET_OK;
167 }
168
169 auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
170 MatMulFp16(pack_input_,
171 reinterpret_cast<float16_t *>(packed_weight_) +
172 task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
173 tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
174 OutType_C8);
175
176 DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
177 reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
178 batch_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
179 return RET_OK;
180 }
181
Init()182 int DeConvolutionFp16CPUKernel::Init() {
183 CHECK_LESS_RETURN(in_tensors_.size(), 2);
184 CHECK_LESS_RETURN(out_tensors_.size(), 1);
185 CHECK_NULL_RETURN(conv_param_);
186 CHECK_NULL_RETURN(in_tensors_.at(kInputIndex));
187 CHECK_NULL_RETURN(in_tensors_.at(kWeightIndex));
188 UpdateOriginWeightAndBias();
189
190 if (op_parameter_->is_train_session_) {
191 auto weight_tensor = in_tensors_.at(kWeightIndex);
192 auto input_channel = weight_tensor->Batch();
193 auto output_channel = weight_tensor->Channel();
194 auto kernel_h = weight_tensor->Height();
195 auto kernel_w = weight_tensor->Width();
196 size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
197 set_workspace_size(weight_pack_size);
198 }
199 matmul_param_ = new (std::nothrow) MatMulParameter();
200 if (matmul_param_ == nullptr) {
201 MS_LOG(ERROR) << "Memory allocation failed";
202 return RET_ERROR;
203 }
204 int ret = InitConvWeightBias();
205 if (ret != RET_OK) {
206 MS_LOG(ERROR) << "deconv InitConvWeightBias error!";
207 return ret;
208 }
209 if (!InferShapeDone()) {
210 return RET_OK;
211 }
212 return ReSize();
213 }
214
Run()215 int DeConvolutionFp16CPUKernel::Run() {
216 if (RepackWeight() != RET_OK) {
217 MS_LOG(ERROR) << "Repack weight failed.";
218 return RET_ERROR;
219 }
220
221 auto input_tensor = in_tensors_.at(kInputIndex);
222 auto output_tensor = out_tensors_.at(kOutputIndex);
223 auto *input_ptr = reinterpret_cast<float16_t *>(input_tensor->data());
224 auto *output_ptr = reinterpret_cast<float16_t *>(output_tensor->data());
225 CHECK_NULL_RETURN(input_ptr);
226 CHECK_NULL_RETURN(output_ptr);
227
228 int error_code = InitRunBuf();
229 if (error_code != RET_OK) {
230 MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]";
231 FreeRunBuf();
232 return RET_ERROR;
233 }
234
235 for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
236 batch_input_ = input_ptr + batch_index * conv_param_->input_channel_ * input_plane_;
237 batch_output_ = output_ptr + batch_index * conv_param_->output_channel_ * output_plane_;
238
239 RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);
240
241 error_code = ParallelLaunch(this->ms_context_, DeConvFp16Run, this, thread_count_);
242 if (error_code != RET_OK) {
243 MS_LOG(ERROR) << "deconv fp16 run error! error_code[" << error_code << "]";
244 FreeRunBuf();
245 return error_code;
246 }
247 }
248
249 FreeRunBuf();
250 return error_code;
251 }
252
CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,OpParameter * op_parameter,const lite::Context * ctx,const kernel::KernelKey & desc)253 kernel::InnerKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
254 const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
255 const lite::Context *ctx, const kernel::KernelKey &desc) {
256 MS_CHECK_TRUE_RET(op_parameter != nullptr, nullptr);
257 MS_CHECK_TRUE_RET(ctx != nullptr, nullptr);
258 MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);
259
260 kernel::InnerKernel *kernel = nullptr;
261 auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
262 if (conv_param->group_ == 1) {
263 if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
264 (conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1)) {
265 kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs,
266 static_cast<const lite::InnerContext *>(ctx));
267 } else {
268 kernel = new (std::nothrow)
269 kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
270 }
271 } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
272 kernel = new (std::nothrow)
273 DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
274 }
275
276 if (kernel == nullptr) {
277 MS_LOG(ERROR) << "kernel is nullptr.";
278 free(op_parameter);
279 return nullptr;
280 }
281 return kernel;
282 }
283
284 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2dTransposeFusion, CpuDeConvFp16KernelCreator)
285 } // namespace mindspore::kernel
286