1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
18 #include "nnacl/fp16/matmul_fp16.h"
19 #include "nnacl/fp16/cast_fp16.h"
20 #include "include/errorcode.h"
21
22 using mindspore::lite::RET_ERROR;
23 using mindspore::lite::RET_INPUT_TENSOR_ERROR;
24 using mindspore::lite::RET_MEMORY_FAILED;
25 using mindspore::lite::RET_OK;
26
27 namespace mindspore::kernel {
MatmulBaseFP16Run(void * cdata,int task_id,float lhs_scale,float rhs_scale)28 int MatmulBaseFP16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
29 CHECK_NULL_RETURN(cdata);
30 auto op = reinterpret_cast<MatmulBaseFP16CPUKernel *>(cdata);
31 auto error_code = op->RunImpl(task_id);
32 if (error_code != RET_OK) {
33 MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
34 return RET_ERROR;
35 }
36 return RET_OK;
37 }
38
~MatmulBaseFP16CPUKernel()39 MatmulBaseFP16CPUKernel::~MatmulBaseFP16CPUKernel() {
40 if (src_b_ != nullptr) {
41 free(src_b_);
42 src_b_ = nullptr;
43 }
44 if (bias_ptr_ != nullptr) {
45 free(bias_ptr_);
46 bias_ptr_ = nullptr;
47 }
48 FreeResizeBufA();
49 FreeResizeBufB();
50 }
51
FreeResizeBufA()52 void MatmulBaseFP16CPUKernel::FreeResizeBufA() {
53 if (a_pack_ptr_ != nullptr) {
54 ms_context_->allocator->Free(a_pack_ptr_);
55 a_pack_ptr_ = nullptr;
56 }
57 return;
58 }
59
FreeResizeBufB()60 void MatmulBaseFP16CPUKernel::FreeResizeBufB() {
61 if (b_pack_ptr_ != nullptr) {
62 ms_context_->allocator->Free(b_pack_ptr_);
63 b_pack_ptr_ = nullptr;
64 }
65 return;
66 }
67
InitParameter()68 void MatmulBaseFP16CPUKernel::InitParameter() {
69 NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[0]);
70 NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[1]);
71 params_->a_const_ = (in_tensors_[0]->data() != nullptr);
72 params_->b_const_ = (in_tensors_[1]->data() != nullptr);
73 }
74
InitBias()75 int MatmulBaseFP16CPUKernel::InitBias() {
76 if (params_->col_ != 0 && bias_ptr_ == nullptr) {
77 int max_bias_data = UP_ROUND(params_->col_, C16NUM);
78 bias_ptr_ = reinterpret_cast<float16_t *>(malloc(max_bias_data * sizeof(float16_t)));
79 if (bias_ptr_ == nullptr) {
80 MS_LOG(ERROR) << "malloc bias_ptr_ failed";
81 return RET_ERROR;
82 }
83 if (in_tensors_.size() == 3) {
84 auto bias_tensor = in_tensors_[2];
85 CHECK_NULL_RETURN(bias_tensor);
86 memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * sizeof(float16_t));
87 } else {
88 memset(bias_ptr_, 0, max_bias_data * sizeof(float16_t));
89 }
90 }
91 return RET_OK;
92 }
93
ReSize()94 int MatmulBaseFP16CPUKernel::ReSize() {
95 ResizeParameter();
96
97 if (params_->b_const_ == true && src_b_ != nullptr) {
98 InitBufferB();
99 InitMatrixB(src_b_, kNumberTypeFloat16);
100 free(src_b_);
101 src_b_ = nullptr;
102 }
103 if (vec_matmul_) {
104 #ifdef ENABLE_ARM64
105 thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C16NUM));
106 thread_stride_ = UP_DIV(UP_DIV(params_->col_, C16NUM), thread_count_) * C16NUM;
107 #else
108 thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
109 thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
110 #endif
111 } else {
112 thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
113 thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
114 }
115 return RET_OK;
116 }
117
ResizeParameter()118 void MatmulBaseFP16CPUKernel::ResizeParameter() {
119 if (params_->row_ == 1) {
120 vec_matmul_ = true;
121 }
122
123 if (vec_matmul_) {
124 params_->row_align_ = 1;
125 #ifdef ENABLE_ARM64
126 params_->col_align_ = UP_ROUND(params_->col_, C16NUM);
127 #else
128 params_->col_align_ = params_->col_;
129 #endif
130 } else {
131 params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
132 params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
133 }
134 }
135
InitBufferA()136 int MatmulBaseFP16CPUKernel::InitBufferA() {
137 a_pack_ptr_ = reinterpret_cast<float16_t *>(
138 ms_context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t)));
139 if (a_pack_ptr_ == nullptr) {
140 return RET_MEMORY_FAILED;
141 }
142
143 memset(a_pack_ptr_, 0, params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t));
144 return RET_OK;
145 }
146
InitBufferB()147 int MatmulBaseFP16CPUKernel::InitBufferB() {
148 if (b_pack_ptr_ != nullptr) {
149 return RET_OK;
150 }
151
152 b_pack_ptr_ = reinterpret_cast<float16_t *>(
153 ms_context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t)));
154 if (b_pack_ptr_ == nullptr) {
155 return RET_MEMORY_FAILED;
156 }
157
158 memset(b_pack_ptr_, 0, params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t));
159 return RET_OK;
160 }
161
// Pack matrix A from src_ptr into a_pack_ptr_, converting fp32 -> fp16 when
// the input tensor is float32.
//   - Vector mode (row_ == 1): A is a plain 1 x deep row per batch, so it is
//     converted/copied verbatim with no tiling.
//   - Matrix mode: each batch slice is repacked by the platform-specific
//     NNACL routine chosen by a_transpose_ (row-major vs. column-major tiles).
// NOTE(review): the ARM64 paths cast src directly to float16_t*, i.e. they
// appear to assume fp16 input on that platform — confirm upstream guarantees.
void MatmulBaseFP16CPUKernel::InitMatrixA(const void *src_ptr) {
  NNACL_CHECK_NULL_RETURN_VOID(src_ptr);
  auto src_data_type = in_tensors_[0]->data_type();

  if (vec_matmul_) {
    if (src_data_type == kNumberTypeFloat32) {
      Float32ToFloat16(reinterpret_cast<const float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
    } else {
      memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t));
    }
    return;
  }

  // Byte-wise batch stride: source elements may be fp32 or fp16, hence the
  // DataTypeSize() factor; the destination is always fp16 with row_align_.
  const int8_t *int8_src = reinterpret_cast<const int8_t *>(src_ptr);
  for (int i = 0; i < params_->batch; i++) {
    const int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
    float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
    if (params_->a_transpose_) {
#ifdef ENABLE_ARM64
      RowMajor2RowNMajorFp16((const float16_t *)src, dst, params_->deep_, params_->row_);
#else
      RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
#endif
    } else {
#ifdef ENABLE_ARM64
      RowMajor2ColNMajorFp16((const float16_t *)src, dst, params_->row_, params_->deep_);
#else
      RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
#endif
    }
  }
  return;
}
195
// Pack matrix B from src_ptr into b_pack_ptr_, converting fp32 -> fp16 when
// src_data_type is float32.
//   - Vector mode, b_transpose_: B is already laid out col x deep, so a
//     convert/copy suffices on non-ARM64; ARM64 repacks into 16-wide column
//     tiles per batch.
//   - Vector mode, no transpose: per-batch repack (16-wide row tiles on
//     ARM64, plain column-major otherwise).
//   - Matrix mode: per-batch repack into 8-wide tiles, orientation chosen by
//     b_transpose_.
// NOTE(review): the ARM64 vector paths index the SOURCE with col_align_
// (not col_) and cast it to float16_t* — this presumes the fp16, already
// col-aligned layout produced by Init()/ReSize(); confirm for other callers.
void MatmulBaseFP16CPUKernel::InitMatrixB(const void *src_ptr, TypeId src_data_type) {
  NNACL_CHECK_NULL_RETURN_VOID(src_ptr);
  const int8_t *int8_src = reinterpret_cast<const int8_t *>(src_ptr);

  if (vec_matmul_) {
    if (params_->b_transpose_) {
      if (src_data_type == kNumberTypeFloat32) {
        Float32ToFloat16(reinterpret_cast<const float *>(src_ptr), b_pack_ptr_,
                         params_->batch * params_->col_ * params_->deep_);
      } else {
#ifdef ENABLE_ARM64
        for (auto i = 0; i < params_->batch; ++i) {
          const auto *b_src = reinterpret_cast<const float16_t *>(src_ptr) + i * params_->col_align_ * params_->deep_;
          auto *dst = b_pack_ptr_ + i * params_->col_align_ * params_->deep_;
          RowMajor2Col16MajorFp16Opt(b_src, dst, params_->col_, params_->deep_);
        }
#else
        memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
#endif
      }
    } else {
      for (int i = 0; i < params_->batch; i++) {
#ifdef ENABLE_ARM64
        const auto *b_src = reinterpret_cast<const float16_t *>(src_ptr) + i * params_->col_align_ * params_->deep_;
        auto *dst = b_pack_ptr_ + i * params_->col_align_ * params_->deep_;
        RowMajor2Row16MajorFp16Opt(b_src, dst, params_->deep_, params_->col_);
#else
        const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
        float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_;
        RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
#endif
      }
    }
    return;
  }

  // Matrix mode: byte-wise source stride (fp32 or fp16), fp16 destination
  // padded to col_align_.
  for (int i = 0; i < params_->batch; i++) {
    const int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
    float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
    if (params_->b_transpose_) {
      RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32);
    } else {
      RowMajor2Row8MajorFp16(src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
    }
  }
  return;
}
243
Init()244 int MatmulBaseFP16CPUKernel::Init() {
245 CHECK_LESS_RETURN(in_tensors_.size(), 2);
246 CHECK_LESS_RETURN(out_tensors_.size(), 1);
247 ResizeParameter();
248 if (params_->a_const_ == true) {
249 if (RET_OK != InitBufferA()) {
250 return RET_ERROR;
251 }
252 MS_ASSERT(in_tensors_[0] != nullptr);
253 MS_ASSERT(in_tensors_[0]->data() != nullptr);
254 InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data()));
255 }
256
257 if (params_->b_const_ == true) {
258 /* copy origin b data, pack in resize
259 * pack after a infershape done */
260 auto b_tensor = in_tensors_[1];
261 MS_ASSERT(b_tensor != nullptr);
262 MS_ASSERT(b_tensor->data() != nullptr);
263 src_b_ = reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)));
264 if (src_b_ == nullptr) {
265 MS_LOG(ERROR) << "Matmul fp16 malloc src_b_ failed";
266 return RET_ERROR;
267 }
268
269 if (b_tensor->data_type() == kNumberTypeFloat32) {
270 Float32ToFloat16(reinterpret_cast<float *>(b_tensor->data()), src_b_,
271 params_->batch * params_->col_ * params_->deep_);
272 } else {
273 memcpy(src_b_, b_tensor->data(), params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
274 }
275 }
276
277 auto ret = InitBias();
278 if (ret != RET_OK) {
279 MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
280 return RET_ERROR;
281 }
282 return RET_OK;
283 }
284
// Compute one thread's slice of the current batch: columns
// [task_id * thread_stride_, task_id * thread_stride_ + cur_oc).
// batch_a_ptr_/batch_b_ptr_/batch_c_ptr_ are positioned per batch by Run().
int MatmulBaseFP16CPUKernel::RunImpl(int task_id) {
  // Clamp the last slice to the remaining columns; trailing tasks created by
  // the UP_DIV partition may have nothing to do.
  int cur_stride = params_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }

  auto bias = bias_ptr_ + thread_stride_ * task_id;
  auto b = batch_b_ptr_ + task_id * thread_stride_ * params_->deep_;
  auto c = batch_c_ptr_ + task_id * thread_stride_;

  // Dispatch to the platform-specific NNACL kernel; output is NHWC either way.
  if (vec_matmul_) {
#ifdef ENABLE_ARM64
    VecMatmulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#else
    MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#endif
  } else {
#ifdef ENABLE_ARM64
    MatmulBaseFp16Neon(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc,
                       params_->col_, OutType_Nhwc);
#else
    MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
               OutType_Nhwc);
#endif
  }
  return RET_OK;
}
313
Run()314 int MatmulBaseFP16CPUKernel::Run() {
315 auto c_ptr = reinterpret_cast<float16_t *>(out_tensors_[0]->data());
316 CHECK_NULL_RETURN(c_ptr);
317
318 if ((params_->a_const_ == false) || IsRepack()) {
319 if (RET_OK != InitBufferA()) {
320 return RET_ERROR;
321 }
322 InitMatrixA(in_tensors_[0]->data());
323 }
324 if ((params_->b_const_ == false) || IsRepack()) {
325 if (RET_OK != InitBufferB()) {
326 FreeResizeBufA();
327 return RET_ERROR;
328 }
329 InitMatrixB(in_tensors_[1]->data(), in_tensors_[1]->data_type());
330 InitBias();
331 }
332
333 for (int i = 0; i < params_->batch; ++i) {
334 if (vec_matmul_) {
335 batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
336 #ifdef ENABLE_ARM64
337 batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
338 #else
339 batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
340 #endif
341 batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
342 } else {
343 batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
344 batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
345 batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
346 }
347 auto ret = ParallelLaunch(this->ms_context_, MatmulBaseFP16Run, this, thread_count_);
348 if (ret != RET_OK) {
349 MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
350 return ret;
351 }
352 }
353
354 if (params_->a_const_ == false) {
355 FreeResizeBufA();
356 }
357
358 if (params_->b_const_ == false) {
359 FreeResizeBufB();
360 }
361 return RET_OK;
362 }
363 } // namespace mindspore::kernel
364