1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h"
18 #include "include/errorcode.h"
19 #include "nnacl/int8/conv_depthwise_int8.h"
20
21 using mindspore::lite::RET_ERROR;
22 using mindspore::lite::RET_OK;
23
24 namespace mindspore::kernel {
~DeconvolutionDepthwiseInt8CPUKernel()25 DeconvolutionDepthwiseInt8CPUKernel::~DeconvolutionDepthwiseInt8CPUKernel() {
26 if (sliding_ != nullptr) {
27 delete sliding_;
28 sliding_ = nullptr;
29 }
30 if (packed_weight_ != nullptr) {
31 delete packed_weight_;
32 packed_weight_ = nullptr;
33 }
34 FreeQuantParam();
35 }
36
InitWeightBias()37 int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
38 // init weight: int8 -> int16
39 // o, h, w, i -> o/8, h, w, i, 8; o equals to group, i equals to 1
40 auto weight_tensor = in_tensors_.at(kWeightIndex);
41 CHECK_NULL_RETURN(weight_tensor);
42 auto origin_weight = reinterpret_cast<int8_t *>(weight_tensor->data());
43 CHECK_NULL_RETURN(origin_weight);
44 if (origin_weight == nullptr) {
45 MS_LOG(ERROR) << "origin_weight nullptr";
46 return RET_ERROR;
47 }
48 int OC4 = UP_DIV(weight_tensor->Batch(), C4NUM);
49 int pack_weight_size = C4NUM * OC4 * weight_tensor->Height() * weight_tensor->Width();
50 packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
51 if (packed_weight_ == nullptr) {
52 MS_LOG(ERROR) << "Malloc buffer failed.";
53 return RET_ERROR;
54 }
55 PackDeconvDepthwiseInt8Weight(origin_weight, packed_weight_, weight_tensor->Height() * weight_tensor->Width(),
56 weight_tensor->Batch(), &(conv_param_->conv_quant_arg_));
57
58 bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
59 if (bias_data_ == nullptr) {
60 MS_LOG(ERROR) << "Malloc buffer failed.";
61 return RET_ERROR;
62 }
63 memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
64 if (in_tensors_.size() == kInputSize2) {
65 auto bias_tensor = in_tensors_.at(kBiasIndex);
66 CHECK_NULL_RETURN(bias_tensor);
67 auto ori_bias = reinterpret_cast<int32_t *>(bias_tensor->data());
68 CHECK_NULL_RETURN(ori_bias);
69 memcpy(bias_data_, ori_bias, bias_tensor->ElementsNum() * sizeof(int32_t));
70 }
71 conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
72 return RET_OK;
73 }
74
InitSlideParam()75 int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
76 MS_CHECK_TRUE_RET(in_tensors_.front()->shape().size() == DIMENSION_4D, RET_ERROR);
77 MS_CHECK_TRUE_RET(out_tensors_.front()->shape().size() == DIMENSION_4D, RET_ERROR);
78
79 conv_param_->input_batch_ = out_tensors_.front()->shape().at(kNHWC_N);
80 conv_param_->input_h_ = out_tensors_.front()->shape().at(kNHWC_H);
81 conv_param_->input_w_ = out_tensors_.front()->shape().at(kNHWC_W);
82 conv_param_->input_channel_ = C4NUM;
83 conv_param_->output_batch_ = in_tensors_.front()->shape().at(kNHWC_N);
84 conv_param_->output_h_ = in_tensors_.front()->shape().at(kNHWC_H);
85 conv_param_->output_w_ = in_tensors_.front()->shape().at(kNHWC_W);
86 conv_param_->output_channel_ = in_tensors_.front()->shape().at(kNHWC_C);
87
88 InitSlidingParamConvDw(sliding_, conv_param_, C4NUM);
89
90 sliding_->in_h_step_ = conv_param_->input_w_ * C4NUM;
91 sliding_->in_sh_step_ = conv_param_->input_w_ * C4NUM * conv_param_->stride_h_; // stride H
92 sliding_->in_sw_step_ = C4NUM * conv_param_->stride_h_; // stride W
93 sliding_->in_kh_step_ = conv_param_->input_w_ * C4NUM * conv_param_->dilation_h_; // kernel H
94 sliding_->in_kw_step_ = C4NUM * conv_param_->dilation_w_; // kernel W
95 return RET_OK;
96 }
97
InitBuffer()98 int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
99 int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
100 UP_DIV(conv_param_->input_channel_, C4NUM);
101 packed_input_ = reinterpret_cast<int16_t *>(ms_context_->allocator->Malloc(pack_input_size * sizeof(int16_t)));
102 if (packed_input_ == nullptr) {
103 MS_LOG(ERROR) << "Malloc buffer failed.";
104 return RET_ERROR;
105 }
106
107 if (conv_param_->input_channel_ % C4NUM != 0) {
108 need_align_ = true;
109 int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
110 UP_DIV(conv_param_->output_channel_, C4NUM);
111 packed_output_ = reinterpret_cast<int8_t *>(ms_context_->allocator->Malloc(pack_output_size * sizeof(int8_t)));
112 if (packed_output_ == nullptr) {
113 MS_LOG(ERROR) << "Malloc buffer failed.";
114 return RET_ERROR;
115 }
116 memset(packed_output_, 0, pack_output_size * sizeof(int8_t));
117 }
118
119 output_buffer_ = reinterpret_cast<int32_t *>(ms_context_->allocator->Malloc(
120 conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * conv_param_->thread_num_ * sizeof(int32_t)));
121 if (output_buffer_ == nullptr) {
122 MS_LOG(ERROR) << "Malloc buffer failed.";
123 return RET_ERROR;
124 }
125 return RET_OK;
126 }
127
Init()128 int DeconvolutionDepthwiseInt8CPUKernel::Init() {
129 CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
130 CHECK_NULL_RETURN(in_tensors_.at(kInputIndex));
131 CHECK_NULL_RETURN(in_tensors_.at(kWeightIndex));
132 CHECK_NULL_RETURN(conv_param_);
133
134 sliding_ = new (std::nothrow) SlidingWindowParam;
135 if (sliding_ == nullptr) {
136 MS_LOG(ERROR) << "new SlidingWindowParam fail!";
137 return RET_ERROR;
138 }
139 auto ret = ConvolutionBaseCPUKernel::SetQuantParam();
140 if (ret != RET_OK) {
141 MS_LOG(ERROR) << "Set quant param failed.";
142 return ret;
143 }
144 ret = InitWeightBias();
145 if (ret != RET_OK) {
146 MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
147 return ret;
148 }
149 if (!InferShapeDone()) {
150 return RET_OK;
151 }
152 return ReSize();
153 }
154
ReSize()155 int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
156 CHECK_LESS_RETURN(in_tensors_.size(), 1);
157 CHECK_LESS_RETURN(out_tensors_.size(), 1);
158 CHECK_NULL_RETURN(in_tensors_.front());
159 CHECK_NULL_RETURN(out_tensors_.front());
160 CHECK_NULL_RETURN(conv_param_);
161 CHECK_NULL_RETURN(sliding_);
162
163 InitSlideParam();
164 ConvolutionBaseCPUKernel::Init();
165 return RET_OK;
166 }
167
DoExecute(int task_id)168 int DeconvolutionDepthwiseInt8CPUKernel::DoExecute(int task_id) {
169 auto buffer = output_buffer_ + conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * task_id;
170 DeconvDwInt8(packed_output_, buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
171 conv_param_, sliding_, task_id);
172 return RET_OK;
173 }
174
DeconvDwInt8Run(void * cdata,int task_id,float lhs_scale,float rhs_scale)175 int DeconvDwInt8Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
176 auto deconv_dw_int8 = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata);
177 auto ret = deconv_dw_int8->DoExecute(task_id);
178 if (ret != RET_OK) {
179 MS_LOG(ERROR) << "DeconvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
180 return RET_ERROR;
181 }
182 return RET_OK;
183 }
184
Run()185 int DeconvolutionDepthwiseInt8CPUKernel::Run() {
186 if (conv_param_->input_channel_ != conv_param_->output_channel_) {
187 MS_LOG(ERROR) << "Only support input channel equals output channel.";
188 return RET_ERROR;
189 }
190 auto ret = InitBuffer();
191 if (ret != RET_OK) {
192 MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
193 ms_context_->allocator->Free(packed_input_);
194 packed_input_ = nullptr;
195 ms_context_->allocator->Free(output_buffer_);
196 output_buffer_ = nullptr;
197 if (need_align_) {
198 ms_context_->allocator->Free(packed_output_);
199 }
200 return ret;
201 }
202
203 CHECK_NULL_RETURN(packed_weight_);
204 CHECK_NULL_RETURN(bias_data_);
205
206 auto input_tensor = in_tensors_.at(kInputIndex);
207 auto output_tensor = out_tensors_.at(kOutputIndex);
208 auto input_addr = reinterpret_cast<int8_t *>(input_tensor->data());
209 auto output_addr = reinterpret_cast<int8_t *>(output_tensor->data());
210 CHECK_NULL_RETURN(input_addr);
211 CHECK_NULL_RETURN(output_addr);
212
213 // pack input, assume input format: NHWC -> NHWC4
214 PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
215
216 if (!need_align_) {
217 memset(output_addr, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
218 packed_output_ = output_addr;
219 }
220
221 ret = ParallelLaunch(this->ms_context_, DeconvDwInt8Run, this, conv_param_->thread_num_);
222 if (ret != RET_OK) {
223 MS_LOG(ERROR) << "DeconvDwInt8Run error: error_code[" << ret << "]";
224 }
225
226 if (need_align_) {
227 PackNHWC4ToNHWCInt8(packed_output_, output_addr, conv_param_->output_batch_,
228 conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
229 ms_context_->allocator->Free(packed_output_);
230 packed_output_ = nullptr;
231 }
232 ms_context_->allocator->Free(packed_input_);
233 packed_input_ = nullptr;
234 ms_context_->allocator->Free(output_buffer_);
235 output_buffer_ = nullptr;
236 return ret;
237 }
238 } // namespace mindspore::kernel
239