/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
#include "nnacl/fp32/matmul_fp32.h"
#include "nnacl/fp32/pack_fp32.h"

using mindspore::lite::RET_NULL_PTR;

namespace mindspore::kernel {
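// ParallelLaunch callback: casts cdata back to the kernel instance and runs the
// column slice owned by task_id. lhs_scale and rhs_scale are part of the
// callback signature but are unused by the fp32 matmul.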
int MatmulBaseFloatRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  CHECK_NULL_RETURN(cdata);
  auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata);
  auto error_code = op->FloatRun(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulBaseFloatRun error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

MatmulFp32BaseCPUKernel::~MatmulFp32BaseCPUKernel() {
  FreeResizeBufA();
  FreeResizeBufB();
  FreeBiasBuf();
  FreeBuffSrcB();
}

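// Marks matrix A/B as constant when their tensor data is already available at
// init time; a train session treats both as variable so they are repacked on
// every run.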
void MatmulFp32BaseCPUKernel::InitParameter() {
  NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[kInputIndex]);
  NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[kWeightIndex]);
  params_->a_const_ = (in_tensors_[kInputIndex]->data() != nullptr);
  params_->b_const_ = (in_tensors_[kWeightIndex]->data() != nullptr);

  if (op_parameter_->is_train_session_) {
    params_->a_const_ = false;
    params_->b_const_ = false;
  }
}

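// Switches to the vector-matmul path when the output has a single row and
// recomputes the aligned row/col sizes used by the packed buffers.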
void MatmulFp32BaseCPUKernel::ResizeParameter() {
  init_global_variable();
  if (params_->row_ == 1) {
    vec_matmul_ = true;
#ifdef ENABLE_AVX
    // vector matmul col is aligned to C8NUM in avx
    col_tile_ = C8NUM;
#elif defined(ENABLE_ARM64)
    col_tile_ = C8NUM;
#endif
    row_tile_ = 1;
  }
  params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
#ifdef ENABLE_AVX
  // on avx, col is aligned to col_tile_
  params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
#elif defined(ENABLE_ARM64)
  // whether vec_matmul_ or not, col_align_ is rounded up to col_tile_
  params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
#else
  params_->col_align_ = vec_matmul_ ? params_->col_ : UP_ROUND(params_->col_, col_tile_);
#endif
  oc_res_ = params_->col_ % col_tile_;
}

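// Allocates (or borrows) the packed buffer for matrix A: the ARM64 vector path
// reads the input tensor in place, a train session uses the shared workspace,
// and every other case mallocs from the context allocator.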
int MatmulFp32BaseCPUKernel::InitBufferA() {
  if (a_pack_ptr_ != nullptr) {
    return RET_OK;
  }
  if (!op_parameter_->is_train_session_) {
#ifdef ENABLE_ARM64
    if (vec_matmul_) {
      a_pack_ptr_ = reinterpret_cast<float *>(in_tensors().at(0)->data());
    } else {
      a_pack_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * sizeof(float)));
    }
#else
    a_pack_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * sizeof(float)));
#endif
  } else {
    a_pack_ptr_ = reinterpret_cast<float *>(workspace());
  }
  if (a_pack_ptr_ == nullptr) {
    MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
    return RET_ERROR;
  }
  return RET_OK;
}

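// Allocates the packed buffer for matrix B, or carves it out of the workspace
// right behind matrix A in a train session.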
int MatmulFp32BaseCPUKernel::InitBufferB() {
  if (b_pack_ptr_ != nullptr) {
    return RET_OK;
  }
  if (op_parameter_->is_train_session_) {
    b_pack_ptr_ = reinterpret_cast<float *>(workspace()) + matrix_a_pack_size_;
  } else {
    b_pack_ptr_ = reinterpret_cast<float *>(
      ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
  }
  if (b_pack_ptr_ == nullptr) {
    MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
    return RET_ERROR;
  }
  return RET_OK;
}

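// Returns how many bias elements are needed when a scalar bias is broadcast:
// the output-channel count of the const weight, rounded up to col_tile_.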
int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
  lite::Tensor *bias_tensor = in_tensors_.at(2);
  int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
  if (!params_->b_const_) {
    MS_LOG(WARNING) << "matmul does not support broadcasting bias data when the weight is non-const";
  } else {
    lite::Tensor *const_tensor = in_tensors_.at(1);
    size_t shape_size = const_tensor->shape().size();
    if (params_->b_transpose_) {
      MS_CHECK_TRUE_RET(shape_size >= kBiasIndex, max_bias_data);
      max_bias_data = UP_ROUND(const_tensor->shape()[shape_size - kBiasIndex], col_tile_);
    } else {
      MS_CHECK_TRUE_RET(shape_size >= kWeightIndex, max_bias_data);
      max_bias_data = UP_ROUND(const_tensor->shape()[shape_size - kWeightIndex], col_tile_);
    }
  }
  return max_bias_data;
}

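// Copies (or broadcasts) the optional third input into a private bias buffer,
// zero-padded up to the col_tile_-aligned element count.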
int MatmulFp32BaseCPUKernel::InitBiasData() {
  if (in_tensors_.size() == 3) {
    auto bias_tensor = in_tensors_[2];
    bool broadcast_bias = (bias_tensor->ElementsNum() == 1);
    // compute the broadcast size before allocating, so the buffer is large
    // enough for the broadcast loop below
    size_t max_bias_data = broadcast_bias ? static_cast<size_t>(CalBroadCastBiasDataElements())
                                          : static_cast<size_t>(UP_ROUND(bias_tensor->ElementsNum(), col_tile_));
    // the malloc'ed address needs to be aligned to 32 bytes
    bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * sizeof(float)));
    if (bias_ptr_ == nullptr) {
      MS_LOG(ERROR) << "malloc bias_ptr_ failed";
      return RET_ERROR;
    }
    if (broadcast_bias) {
      // broadcast the single bias value across the aligned output channels
      float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
      for (size_t i = 0; i < max_bias_data; ++i) {
        bias_ptr_[i] = broadcast_data;
      }
    } else {
      memset(bias_ptr_, 0, max_bias_data * sizeof(float));
      memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
    }
  }
  return RET_OK;
}

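// Packs matrix A batch by batch with the arch-specific pack function. The
// vector path skips packing entirely on ARM64 and degenerates to a plain copy
// elsewhere.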
int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
  CHECK_NULL_RETURN(src_ptr);
#ifdef ENABLE_ARM64
  if (vec_matmul_) {
    return RET_OK;
  }
#else
  if (vec_matmul_) {
    memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * static_cast<int>(sizeof(float)));
    return RET_OK;
  }
#endif
  for (int i = 0; i < params_->batch; i++) {
    const float *src = src_ptr + i * params_->deep_ * params_->row_;
    float *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
    if (params_->a_transpose_) {
      matrix_a_pack_fun_(src, dst, params_->deep_, params_->row_);
    } else {
      matrix_a_pack_fun_(src, dst, params_->row_, params_->deep_);
    }
  }
  return RET_OK;
}

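// Packs matrix B batch by batch. The vector path picks an arch-specific layout
// (32-wide on AVX, 8-wide on ARM64); the block path defers to
// matrix_b_pack_fun_ chosen in init_global_variable().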
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) {
  CHECK_NULL_RETURN(src_ptr);
  if (vec_matmul_) {
    for (int i = 0; i < params_->batch; i++) {
      const float *src_data = src_ptr + i * params_->deep_ * params_->col_;
      float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
      if (params_->b_transpose_) {
#ifdef ENABLE_AVX
        RowMajor2Col32Major(src_data, dst, params_->deep_, params_->col_);
#elif defined(ENABLE_ARM64)
        RowMajor2Col8Major(src_data, dst, params_->col_, params_->deep_);
#else
        memcpy(dst, src_data, params_->col_ * params_->deep_ * static_cast<int>(sizeof(float)));
#endif
      } else {
#ifdef ENABLE_AVX
        RowMajor2Row32Major(src_data, dst, params_->col_, params_->deep_);
#elif defined(ENABLE_ARM64)
        RowMajor2Row8Major(src_data, dst, params_->deep_, params_->col_);
#else
        RowMajor2ColMajor(src_data, dst, params_->deep_, params_->col_);
#endif
      }
    }
    return RET_OK;
  }

  for (int i = 0; i < params_->batch; i++) {
    const float *src = src_ptr + i * params_->deep_ * params_->col_;
    float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
    if (params_->b_transpose_) {
      matrix_b_pack_fun_(src, dst, params_->col_, params_->deep_);
    } else {
      matrix_b_pack_fun_(src, dst, params_->deep_, params_->col_);
    }
  }
  return RET_OK;
}

void MatmulFp32BaseCPUKernel::FreeBiasBuf() {
  if (bias_ptr_ != nullptr) {
    free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
}

void MatmulFp32BaseCPUKernel::FreeResizeBufA() {
  if (!op_parameter_->is_train_session_) {
#ifdef ENABLE_ARM64
    if (vec_matmul_) {
      // the ARM64 vector path aliases the input tensor, so only drop the pointer
      a_pack_ptr_ = nullptr;
    } else {
      if (a_pack_ptr_ != nullptr) {
        ms_context_->allocator->Free(a_pack_ptr_);
        a_pack_ptr_ = nullptr;
      }
    }
#else
    if (a_pack_ptr_ != nullptr) {
      ms_context_->allocator->Free(a_pack_ptr_);
      a_pack_ptr_ = nullptr;
    }
#endif
  } else {
    a_pack_ptr_ = nullptr;
  }
}

void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
  if (!op_parameter_->is_train_session_) {
    if (b_pack_ptr_ != nullptr) {
      ms_context_->allocator->Free(b_pack_ptr_);
      b_pack_ptr_ = nullptr;
    }
  } else {
    b_pack_ptr_ = nullptr;
  }
}

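// Per-thread worker: each task owns a stride of output columns starting at
// task_id * thread_stride_ * col_tile_ and dispatches to the vector or block
// matmul kernel for its slice.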
int MatmulFp32BaseCPUKernel::FloatRun(int task_id) const {
  int current_start_oc = task_id * thread_stride_ * col_tile_;
  int current_rest_oc = 0;
#if defined(ENABLE_AVX)
  if (vec_matmul_) {
    current_rest_oc = params_->col_align_ - current_start_oc;
  } else {
    current_rest_oc = params_->col_ - current_start_oc;
  }
#else
  current_rest_oc = params_->col_ - current_start_oc;
#endif
  int cur_oc = MSMIN(thread_stride_ * col_tile_, current_rest_oc);
  if (cur_oc <= 0) {
    return RET_OK;
  }

  auto b = batch_b_ptr_ + current_start_oc * params_->deep_;
  auto c = batch_c_ptr_ + current_start_oc;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + current_start_oc;
  if (vec_matmul_) {
#ifdef ENABLE_AVX
    MatVecMulAvxFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_);
#elif defined(ENABLE_ARM64)
    int rest_align_col = MSMIN(params_->col_align_ - current_start_oc, thread_stride_ * col_tile_);
    MatVecMulFp32Neon64(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, rest_align_col);
#else
    MatVecMulFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#endif
  } else {
    MatMulOpt(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
              OutType_Nhwc);
  }
  return RET_OK;
}

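// Selects the arch-specific pack functions and tile sizes (6x16 on AVX, 12x4
// on ARM32, 4x8 with SSE, 12x8 otherwise) and resets the vector-matmul flag.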
void MatmulFp32BaseCPUKernel::init_global_variable() {
#ifdef ENABLE_AVX
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row6Major : RowMajor2Col6Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col16Major : RowMajor2Row16Major;
  row_tile_ = C6NUM;
  col_tile_ = C16NUM;
#elif defined(ENABLE_ARM32)
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row12Major : RowMajor2Col12Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col4Major : RowMajor2Row4Major;
  row_tile_ = C12NUM;
  col_tile_ = C4NUM;
#elif defined(ENABLE_SSE)
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row4Major : RowMajor2Col4Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col8Major : RowMajor2Row8Major;
  row_tile_ = C4NUM;
  col_tile_ = C8NUM;
#else
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row12Major : RowMajor2Col12Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col8Major : RowMajor2Row8Major;
  row_tile_ = C12NUM;
  col_tile_ = C8NUM;
#endif
  params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
  vec_matmul_ = false;
}

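// One-time setup: validates tensor counts, prepares the bias buffer, and
// pre-packs a const matrix A. A const matrix B is only copied raw into src_b_
// here; it is packed in ReSize once the aligned sizes are known.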
int MatmulFp32BaseCPUKernel::Init() {
  CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
  init_global_variable();
  matrix_a_pack_size_ = params_->batch * params_->row_align_ * params_->deep_;
  if (matrix_a_pack_size_ < 0) {
    MS_LOG(ERROR) << "Matrix pack size is negative "
                  << "matrix_a_pack_size=" << matrix_a_pack_size_;
    return RET_ERROR;
  }
  auto ret = InitBiasData();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InitBiasData failed";
    return ret;
  }
  if (params_->a_const_) {
    if (RET_OK != InitBufferA()) {
      return RET_ERROR;
    }
    ret = InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data()));
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "InitMatrixA failed!";
      return ret;
    }
  }
  if (params_->b_const_) {
    // only copy the raw weight data here; packing happens in ReSize or Run,
    // once the aligned sizes are known
    auto b_tensor = in_tensors_.at(1);
    src_b_ = reinterpret_cast<float *>(
      malloc(params_->batch * params_->deep_ * params_->col_ * static_cast<int>(sizeof(float))));
    if (src_b_ == nullptr) {
      MS_LOG(ERROR) << "matmul fp32 malloc src_b_ failed!";
      return RET_ERROR;
    }
    memcpy(src_b_, b_tensor->data(), params_->batch * params_->deep_ * params_->col_ * static_cast<int>(sizeof(float)));
  }
  return RET_OK;
}

void MatmulFp32BaseCPUKernel::FreeBuffSrcB() {
  if (src_b_ != nullptr) {
    free(src_b_);
    src_b_ = nullptr;
  }
}

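// Shape-dependent setup: recomputes the aligned pack sizes, packs the cached
// const matrix B, and derives the per-thread column stride.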
int MatmulFp32BaseCPUKernel::ReSize() {
  ResizeParameter();
  matrix_a_pack_size_ = params_->batch * params_->row_align_ * params_->deep_;
  matrix_b_pack_size_ = params_->batch * params_->col_align_ * params_->deep_;
  if (matrix_a_pack_size_ < 0 || matrix_b_pack_size_ < 0) {
    MS_LOG(ERROR) << "Matrix pack size is negative: "
                  << "matrix_a_pack_size=" << matrix_a_pack_size_ << ", matrix_b_pack_size=" << matrix_b_pack_size_;
    return RET_ERROR;
  }
  if (op_parameter_->is_train_session_) {
    set_workspace_size((matrix_a_pack_size_ + matrix_b_pack_size_) * static_cast<int>(sizeof(float)));
  }

  if (params_->b_const_ && src_b_ != nullptr) {
    if (InitBufferB() != RET_OK) {
      FreeBuffSrcB();
      return RET_ERROR;
    }
    if (InitMatrixB(src_b_) != RET_OK) {
      FreeBuffSrcB();
      MS_LOG(ERROR) << "InitMatrixB failed!";
      return RET_ERROR;
    }
    FreeBuffSrcB();
  }
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_align_, col_tile_));
#if defined(ENABLE_AVX)
  if (vec_matmul_) {
    thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_ * C4NUM), thread_count_) * C4NUM;
  } else {
    thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_);
  }
#else
  thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_);
#endif
  return RET_OK;
}

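// Decides where the kernels write their output: on AVX the vector path needs a
// temporary col_tile_-aligned buffer when col_ is not a multiple of col_tile_;
// otherwise the output tensor is written directly.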
int MatmulFp32BaseCPUKernel::InitTmpOutBuffer() {
  auto out_data = reinterpret_cast<float *>(out_tensors_.front()->data());
  MS_ASSERT(out_data != nullptr);
#ifdef ENABLE_AVX
  if (oc_res_ != 0 && vec_matmul_) {  // vec matmul writes col_tile_-aligned columns, so a temporary dst is needed
    int out_channel = params_->col_;
    int oc_block_num = UP_DIV(out_channel, col_tile_);
    MS_ASSERT(ms_context_->allocator != nullptr);
    output_data_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
      params_->batch * params_->row_ * oc_block_num * col_tile_ * static_cast<int>(sizeof(float))));
    if (output_data_ == nullptr) {
      MS_LOG(ERROR) << "malloc tmp output data failed.";
      return RET_NULL_PTR;
    }
  } else {  // otherwise the output tensor buffer can be written directly
    output_data_ = out_data;
  }
#else
  output_data_ = out_data;
#endif
  return RET_OK;
}

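// Per-invocation flow: pack any non-const inputs, pick the output buffer, run
// the threaded matmul for each batch, then (on AVX) strip the alignment
// padding and release the resize buffers.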
int MatmulFp32BaseCPUKernel::Run() {
  if (!params_->a_const_) {
    auto a_ptr = reinterpret_cast<float *>(in_tensors_[0]->data());
    CHECK_NULL_RETURN(a_ptr);
    if (RET_OK != InitBufferA()) {
      return RET_ERROR;
    }
    auto ret = InitMatrixA(a_ptr);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "InitMatrixA failed!";
      return ret;
    }
  }
  if (!params_->b_const_) {
    auto b_ptr = reinterpret_cast<float *>(in_tensors_[1]->data());
    CHECK_NULL_RETURN(b_ptr);
    if (RET_OK != InitBufferB()) {
      FreeResizeBufA();
      return RET_ERROR;
    }
    auto ret = InitMatrixB(b_ptr);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "InitMatrixB failed!";
      return ret;
    }
  }

  auto ret = InitTmpOutBuffer();
  if (ret != RET_OK) {
    FreeResizeBufA();
    FreeResizeBufB();
    MS_LOG(ERROR) << "InitTmpOutBuffer error!";
    return ret;
  }

  for (int i = 0; i < params_->batch; ++i) {
    batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
    batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
    if (vec_matmul_) {
      batch_c_ptr_ = output_data_ + i * params_->row_ * params_->col_align_;
    } else {
      // the output rows need no alignment here
      batch_c_ptr_ = output_data_ + i * params_->row_ * params_->col_;
    }
    ret = ParallelLaunch(this->ms_context_, MatmulBaseFloatRun, this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
      break;  // stop at the first failing batch so the error is not masked by later iterations
    }
  }

#ifdef ENABLE_AVX
  if (oc_res_ != 0 && vec_matmul_) {
    // strip the col_tile_ padding from the temporary buffer back into the output tensor
    auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
    PackNHWCXToNHWCFp32(output_data_, out_data, params_->batch, params_->row_, params_->col_, col_tile_);
    ms_context_->allocator->Free(output_data_);
    output_data_ = nullptr;
  }
#endif
  if (!params_->a_const_) {
    FreeResizeBufA();
  }

  if (!params_->b_const_) {
    FreeResizeBufB();
  }
  return ret;
}
}  // namespace mindspore::kernel