/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
#include "nnacl/base/conv1x1_base.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
#include "include/errorcode.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {
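// A 1x1 convolution over NHWC input reduces to a matrix multiplication:
// row = output_h * output_w, col = output_channel, deep = input_channel,
// with row/col aligned up to the architecture tile sizes.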
int Convolution1x1FP16CPUKernel::InitMatmulParam() {
  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
  matmul_param_->col_ = conv_param_->output_channel_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_);
  matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_);
  matmul_param_->act_type_ = conv_param_->act_type_;
  return RET_OK;
}

Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() {
  FreeTmpBuffer();
  if (matmul_param_ != nullptr) {
    delete matmul_param_;
    matmul_param_ = nullptr;
  }
  return;
}

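// Decide the parallelization strategy and whether the input needs a pre-transform step.
// The input is repacked when padding or stride differs from the plain 1x1 case; threading
// splits along the output HW plane when rows dominate, otherwise along output channels.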
int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                      conv_param_->stride_w_ != 1);

  if ((matmul_param_->row_ > (row_tile_ * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
    multi_thread_by_hw_ = true;
    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_tile_));
    if (thread_count_ <= 0) {
      MS_LOG(ERROR) << "thread_count_ must be greater than 0!";
      return RET_ERROR;
    }
    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, row_tile_), thread_count_) * row_tile_;
  } else {
    multi_thread_by_hw_ = false;
    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_tile_));
    if (thread_count_ <= 0) {
      MS_LOG(ERROR) << "thread_count_ must be greater than 0!";
      return RET_ERROR;
    }
    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_tile_), thread_count_) * col_tile_;
  }
  matmul_param_->op_parameter_.thread_num_ = thread_count_;

  if (pre_trans_input_) {
    input_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)));
    if (input_ptr_ == nullptr) {
      MS_LOG(ERROR) << "Conv1x1 Malloc input_ptr_ error!";
      return RET_MEMORY_FAILED;
    }
    memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t));
  }
  return RET_OK;
}

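// Allocate the packed-weight and bias buffers, sized with the output channel rounded up to the
// column tile so the matmul kernels can read whole tiles. In train sessions the packed weight
// comes from the workspace instead, so only the bias is allocated here.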
int Convolution1x1FP16CPUKernel::MallocWeightBiasData() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  auto input_channel = weight_tensor->Channel();
  auto output_channel = weight_tensor->Batch();

  size_t size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
  if (!op_parameter_->is_train_session_) {
    if (packed_weight_ == nullptr) {
      packed_weight_ = malloc(size);
      if (packed_weight_ == nullptr) {
        MS_LOG(ERROR) << "Conv1x1 Malloc packed_weight_ error!";
        return RET_ERROR;
      }
    }
    memset(packed_weight_, 0, size);
  }

  if (in_tensors_.size() == kInputSize2) {
    size = UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
    if (bias_data_ == nullptr) {
      bias_data_ = malloc(size);
      if (bias_data_ == nullptr) {
        MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
        return RET_ERROR;
      }
    }
    memset(bias_data_, 0, size);
  }
  return RET_OK;
}

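// Repack the origin weight into the layout the matmul kernels expect: ARM64 with non-NC4HW4
// output uses RowMajor2Col16MajorFp16Opt, all other cases use ColMajor2Row8MajorFp16.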
void Convolution1x1FP16CPUKernel::PackWeight() {
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  auto input_channel = weight_tensor->Channel();
  auto output_channel = weight_tensor->Batch();
  void *weight_origin = (op_parameter_->is_train_session_) ? weight_tensor->data() : origin_weight_;
  MS_ASSERT(weight_origin != nullptr);
#ifdef ENABLE_ARM64
  if (out_tensors_.front()->format() == NC4HW4) {
    ColMajor2Row8MajorFp16(weight_origin, reinterpret_cast<float16_t *>(packed_weight_), input_channel, output_channel,
                           true);
  } else {
    RowMajor2Col16MajorFp16Opt(static_cast<const float16_t *>(weight_origin),
                               reinterpret_cast<float16_t *>(packed_weight_), output_channel, input_channel);
  }
#else
  ColMajor2Row8MajorFp16(weight_origin, reinterpret_cast<float16_t *>(packed_weight_), input_channel, output_channel,
                         true);
#endif
}

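// Select row/col tile sizes for the target architecture and output format, reserve the
// train-session workspace for the packed weight, and initialize the matmul parameter and weights.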
int Convolution1x1FP16CPUKernel::Init() {
  CHECK_LESS_RETURN(in_tensors_.size(), 2);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
  UpdateOriginWeightAndBias();
#ifdef ENABLE_ARM64
  if (out_tensors_.front()->format() == NC4HW4) {
    row_tile_ = C16NUM;
    col_tile_ = C8NUM;
  } else {
    row_tile_ = C12NUM;
    col_tile_ = C16NUM;
  }
#else
  row_tile_ = C12NUM;
  col_tile_ = C8NUM;
#endif
  if (op_parameter_->is_train_session_) {
    auto weight_tensor = in_tensors_.at(kWeightIndex);
    CHECK_NULL_RETURN(weight_tensor);
    auto input_channel = weight_tensor->Channel();
    auto output_channel = weight_tensor->Batch();
    size_t size = input_channel * UP_ROUND(output_channel, col_tile_) * sizeof(float16_t);
    set_workspace_size(size);
  }
  matmul_param_ = new (std::nothrow) MatMulParameter();
  if (matmul_param_ == nullptr) {
    MS_LOG(ERROR) << "Init matmul_param_ failed.";
    return RET_ERROR;
  }
  int ret = InitConvWeightBias();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init weight bias failed.";
    return ret;
  }
  return RET_OK;
}

void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
  if (pre_trans_input_ && input_ptr_ != nullptr) {
    free(input_ptr_);
    input_ptr_ = nullptr;
  }
  return;
}

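// Recompute the matmul shape and threading parameters whenever the tensor shapes change.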
int Convolution1x1FP16CPUKernel::ReSize() {
  FreeTmpBuffer();
  auto ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBase init failed.";
    return ret;
  }
  ret = InitMatmulParam();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init matmul param failed.";
    return ret;
  }
  ret = InitConv1x1Param();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init conv1x1 param failed.";
    return ret;
  }
  return RET_OK;
}

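// Worker body when threading by output channel: each task computes a thread_stride_-wide
// slice of the output columns against the full packed input.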
int Convolution1x1FP16CPUKernel::RunOc(int task_id) {
  int cur_stride = matmul_param_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }

  auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float16_t *>(bias_data_) + thread_stride_ * task_id;

  if (out_tensors_.front()->format() == NC4HW4) {
    Conv1x1OutNc8hw8MultiThreadByWeightFp16(input_ptr_, pack_input_, reinterpret_cast<float16_t *>(packed_weight_),
                                            reinterpret_cast<float16_t *>(bias_data_), output_ptr_, task_id,
                                            matmul_param_);
  } else {
#ifdef ENABLE_ARM64
    MatMul12x16Fp16Opt(pack_input_,
                       reinterpret_cast<float16_t *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
                       output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
                       matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
#else
    MatMul12x8A32Fp16(pack_input_,
                      reinterpret_cast<float16_t *>(packed_weight_) + task_id * thread_stride_ * matmul_param_->deep_,
                      output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
                      matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
#endif
  }
  return RET_OK;
}

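// Worker body when threading by the output HW plane: each task packs and multiplies a
// thread_stride_-row slice of the input against the full packed weight.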
int Convolution1x1FP16CPUKernel::RunHw(int task_id) {
  if (out_tensors_.front()->format() == NC4HW4) {
    Conv1x1OutNc8hw8MultiThreadByInputFp16(input_ptr_, pack_input_, reinterpret_cast<float16_t *>(packed_weight_),
                                           reinterpret_cast<float16_t *>(bias_data_), output_ptr_, task_id,
                                           matmul_param_);
  } else {
    int res_stride = matmul_param_->row_ - task_id * thread_stride_;
    int cur_hw_ = MSMIN(thread_stride_, res_stride);
    if (cur_hw_ <= 0) {
      return RET_OK;
    }
    float16_t *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
    float16_t *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
    float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;

    RowMajor2Col12MajorFp16Opt(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
#ifdef ENABLE_ARM64
    MatMul12x16Fp16Opt(thread_pack_input, reinterpret_cast<float16_t *>(packed_weight_), thread_output_ptr,
                       reinterpret_cast<float16_t *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_,
                       cur_hw_, matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
#else
    MatMul12x8A32Fp16(thread_pack_input, reinterpret_cast<float16_t *>(packed_weight_), thread_output_ptr,
                      reinterpret_cast<float16_t *>(bias_data_), matmul_param_->act_type_, matmul_param_->deep_,
                      cur_hw_, matmul_param_->col_, matmul_param_->col_, OutType_Nhwc);
#endif
  }
  return RET_OK;
}

static int Convolution1x1Fp16RunOc(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
  auto error_code = conv->RunOc(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Convolution1x1 Fp16 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

static int Convolution1x1Fp16RunHw(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
  auto error_code = conv->RunHw(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Convolution1x1 Fp16 Run hw error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

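// Per-batch entry point: allocate the packed-input scratch buffer, repack the weight if needed,
// optionally pre-transform the input (padding/stride case), then launch the HW- or OC-split workers.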
int Convolution1x1FP16CPUKernel::Run() {
  auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data());
  auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data());
  MS_ASSERT(input_data != nullptr);
  MS_ASSERT(output_data != nullptr);
  if (input_data == nullptr || output_data == nullptr) {
    MS_LOG(ERROR) << "Convolution1x1 Fp16 get null tensor data!";
    return RET_ERROR;
  }
  pack_input_ = reinterpret_cast<float16_t *>(
    ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float16_t)));
  if (pack_input_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!";
    return RET_MEMORY_FAILED;
  }
  if (RepackWeight() != RET_OK) {
    MS_LOG(ERROR) << "Repack weight failed.";
    return RET_ERROR;
  }

  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
    output_ptr_ = output_data + batch_index * matmul_param_->row_ * matmul_param_->col_;
    float16_t *batch_in =
      input_data + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
    if (pre_trans_input_) {
      Conv1x1InputPack(batch_in, input_ptr_, conv_param_, sizeof(float16_t));
    } else {
      input_ptr_ = batch_in;
    }

    int ret = RET_ERROR;
    if (multi_thread_by_hw_) {
      ret = ParallelLaunch(this->ms_context_, Convolution1x1Fp16RunHw, this, thread_count_);
    } else {
      if (out_tensors_.front()->format() == NC4HW4) {
#ifdef ENABLE_ARM64
        RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
#else
        RowMajor2Col12MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
#endif
      } else {
        RowMajor2Col12MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
      }
      ret = ParallelLaunch(this->ms_context_, Convolution1x1Fp16RunOc, this, thread_count_);
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "ParallelLaunch failed.";
      ctx_->allocator->Free(pack_input_);
      pack_input_ = nullptr;
      return ret;
    }
  }
  ctx_->allocator->Free(pack_input_);
  pack_input_ = nullptr;
  return RET_OK;
}
}  // namespace mindspore::kernel