• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/runtime/kernel/arm/fp32/matmul_fp32_base.h"
18 #include "nnacl/fp32/matmul_fp32.h"
19 #include "nnacl/fp32/pack_fp32.h"
20 
21 using mindspore::lite::RET_NULL_PTR;
22 
23 namespace mindspore::kernel {
MatmulBaseFloatRun(void * cdata,int task_id,float lhs_scale,float rhs_scale)24 int MatmulBaseFloatRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
25   CHECK_NULL_RETURN(cdata);
26   auto op = reinterpret_cast<MatmulFp32BaseCPUKernel *>(cdata);
27   auto error_code = op->FloatRun(task_id);
28   if (error_code != RET_OK) {
29     MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
30     return RET_ERROR;
31   }
32   return RET_OK;
33 }
34 
// Release every buffer this kernel may own. Each Free* helper checks its
// pointer for null (and ownership mode) first, so destruction is safe
// regardless of how far initialization progressed.
MatmulFp32BaseCPUKernel::~MatmulFp32BaseCPUKernel() {
  FreeResizeBufA();
  FreeResizeBufB();
  FreeBiasBuf();
  FreeBuffSrcB();
}
41 
InitParameter()42 void MatmulFp32BaseCPUKernel::InitParameter() {
43   NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[kInputIndex]);
44   NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[kWeightIndex]);
45   params_->a_const_ = (in_tensors_[kInputIndex]->data() != nullptr);
46   params_->b_const_ = (in_tensors_[kWeightIndex]->data() != nullptr);
47 
48   if (op_parameter_->is_train_session_) {
49     params_->a_const_ = false;
50     params_->b_const_ = false;
51   }
52 }
53 
// Recompute tiling-dependent sizes once shapes are known. Switches to the
// vector-matmul path when A has a single row and derives the aligned row/col
// extents used to size the pack buffers.
void MatmulFp32BaseCPUKernel::ResizeParameter() {
  init_global_variable();
  if (params_->row_ == 1) {
    vec_matmul_ = true;
#ifdef ENABLE_AVX
    // vector matmul col is aligned to C8NUM in avx
    col_tile_ = C8NUM;
#elif defined(ENABLE_ARM64)
    col_tile_ = C8NUM;
#endif
    row_tile_ = 1;  // a single row needs no row blocking
  }
  params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
#ifdef ENABLE_AVX
  // avx is aligned to col_tile_
  params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
#elif defined(ENABLE_ARM64)
  // no matter vec_matmul_ or not, use col_tile_ to get col_align_
  params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
#else
  // other targets keep the raw col_ on the vector path (B is copied unpacked there)
  params_->col_align_ = vec_matmul_ ? params_->col_ : UP_ROUND(params_->col_, col_tile_);
#endif
  // leftover output channels that do not fill a whole col_tile_ block
  oc_res_ = params_->col_ % col_tile_;
}
78 
InitBufferA()79 int MatmulFp32BaseCPUKernel::InitBufferA() {
80   if (a_pack_ptr_ != nullptr) {
81     return RET_OK;
82   }
83   if (!op_parameter_->is_train_session_) {
84 #ifdef ENABLE_ARM64
85     if (vec_matmul_) {
86       a_pack_ptr_ = reinterpret_cast<float *>(in_tensors().at(0)->data());
87     } else {
88       a_pack_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * sizeof(float)));
89     }
90 #else
91     a_pack_ptr_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(matrix_a_pack_size_ * sizeof(float)));
92 #endif
93   } else {
94     a_pack_ptr_ = reinterpret_cast<float *>(workspace());
95   }
96   if (a_pack_ptr_ == nullptr) {
97     MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
98     return RET_ERROR;
99   }
100   return RET_OK;
101 }
102 
InitBufferB()103 int MatmulFp32BaseCPUKernel::InitBufferB() {
104   if (b_pack_ptr_ != nullptr) {
105     return RET_OK;
106   }
107   if (op_parameter_->is_train_session_) {
108     b_pack_ptr_ = reinterpret_cast<float *>(workspace()) + matrix_a_pack_size_;
109   } else {
110     b_pack_ptr_ = reinterpret_cast<float *>(
111       ms_context_->allocator->Malloc(static_cast<size_t>(matrix_b_pack_size_) * sizeof(float)));
112   }
113   if (b_pack_ptr_ == nullptr) {
114     MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
115     return RET_ERROR;
116   }
117   return RET_OK;
118 }
119 
CalBroadCastBiasDataElements()120 int MatmulFp32BaseCPUKernel::CalBroadCastBiasDataElements() {
121   lite::Tensor *bias_tensor = in_tensors_.at(2);
122   int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
123   if (!params_->b_const_) {
124     MS_LOG(WARNING) << "matmul do not support broadcast bias data";
125   } else {
126     lite::Tensor *const_tensor = in_tensors_.at(1);
127     size_t shape_size = const_tensor->shape().size();
128     if (params_->b_transpose_) {
129       MS_CHECK_TRUE_RET(shape_size >= kBiasIndex, max_bias_data);
130       max_bias_data = UP_ROUND(const_tensor->shape()[shape_size - kBiasIndex], col_tile_);
131     } else {
132       MS_CHECK_TRUE_RET(shape_size >= kWeightIndex, max_bias_data);
133       max_bias_data = UP_ROUND(const_tensor->shape()[shape_size - kWeightIndex], col_tile_);
134     }
135   }
136   return max_bias_data;
137 }
138 
InitBiasData()139 int MatmulFp32BaseCPUKernel::InitBiasData() {
140   if (in_tensors_.size() == 3) {
141     auto bias_tensor = in_tensors_[2];
142     size_t max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), col_tile_);
143     // malloc addr need to aligned to 32 bytes
144     bias_ptr_ = reinterpret_cast<float *>(malloc(max_bias_data * static_cast<int>(sizeof(float))));
145     if (bias_ptr_ == nullptr) {
146       MS_LOG(ERROR) << "malloc bias_ptr_ failed";
147       return RET_ERROR;
148     }
149     // whether to broadcast bias data
150     if (bias_tensor->ElementsNum() == 1) {
151       max_bias_data = CalBroadCastBiasDataElements();
152       float broadcast_data = (reinterpret_cast<float *>(bias_tensor->data()))[0];
153       // broadcast bias data
154       for (size_t i = 0; i < max_bias_data; ++i) {
155         bias_ptr_[i] = broadcast_data;
156       }
157     } else {
158       memset(bias_ptr_, 0, max_bias_data * static_cast<int>(sizeof(float)));
159       memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * static_cast<int>(sizeof(float)));
160     }
161   }
162   return RET_OK;
163 }
164 
InitMatrixA(const float * src_ptr)165 int MatmulFp32BaseCPUKernel::InitMatrixA(const float *src_ptr) {
166   CHECK_NULL_RETURN(src_ptr);
167 #ifdef ENABLE_ARM64
168   if (vec_matmul_) {
169     return RET_OK;
170   }
171 #else
172   if (vec_matmul_) {
173     memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * static_cast<int>(sizeof(float)));
174     return RET_OK;
175   }
176 #endif
177   for (int i = 0; i < params_->batch; i++) {
178     const float *src = src_ptr + i * params_->deep_ * params_->row_;
179     float *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
180     if (params_->a_transpose_) {
181       matrix_a_pack_fun_(src, dst, params_->deep_, params_->row_);
182     } else {
183       matrix_a_pack_fun_(src, dst, params_->row_, params_->deep_);
184     }
185   }
186   return RET_OK;
187 }
188 
// Pack matrix B (weights) batch-by-batch into b_pack_ptr_, in the layout the
// target's matmul kernel expects. Returns RET_OK, or the CHECK_NULL status
// when src_ptr is null.
int MatmulFp32BaseCPUKernel::InitMatrixB(const float *src_ptr) {
  CHECK_NULL_RETURN(src_ptr);
  if (vec_matmul_) {
    // Vector (row_ == 1) path: each target wants its own B layout.
    for (int i = 0; i < params_->batch; i++) {
      const float *src_data = src_ptr + i * params_->deep_ * params_->col_;
      float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
      if (params_->b_transpose_) {
#ifdef ENABLE_AVX
        RowMajor2Col32Major(src_data, dst, params_->deep_, params_->col_);
#elif defined(ENABLE_ARM64)
        RowMajor2Col8Major(src_data, dst, params_->col_, params_->deep_);
#else
        // generic target consumes transposed B as-is: plain copy suffices
        memcpy(dst, src_data, params_->col_ * params_->deep_ * static_cast<int>(sizeof(float)));
#endif
      } else {
#ifdef ENABLE_AVX
        RowMajor2Row32Major(src_data, dst, params_->col_, params_->deep_);
#elif defined(ENABLE_ARM64)
        RowMajor2Row8Major(src_data, dst, params_->deep_, params_->col_);
#else
        RowMajor2ColMajor(src_data, dst, params_->deep_, params_->col_);
#endif
      }
    }
    return RET_OK;
  }

  // General path: use the pack function chosen in init_global_variable();
  // the argument order encodes whether B is transposed.
  for (int i = 0; i < params_->batch; i++) {
    const float *src = src_ptr + i * params_->deep_ * params_->col_;
    float *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
    if (params_->b_transpose_) {
      matrix_b_pack_fun_(src, dst, params_->col_, params_->deep_);
    } else {
      matrix_b_pack_fun_(src, dst, params_->deep_, params_->col_);
    }
  }
  return RET_OK;
}
227 
FreeBiasBuf()228 void MatmulFp32BaseCPUKernel::FreeBiasBuf() {
229   if (bias_ptr_ != nullptr) {
230     free(bias_ptr_);
231     bias_ptr_ = nullptr;
232   }
233 }
234 
// Release the packed-A buffer. Ownership varies with how InitBufferA set it:
//  - train session: a_pack_ptr_ aliases the workspace -> just clear the pointer.
//  - arm64 vector path: a_pack_ptr_ aliases the input tensor -> just clear.
//  - otherwise: the buffer came from the allocator -> free it.
void MatmulFp32BaseCPUKernel::FreeResizeBufA() {
  if (!op_parameter_->is_train_session_) {
#ifdef ENABLE_ARM64
    if (vec_matmul_) {
      a_pack_ptr_ = nullptr;
    } else {
      if (a_pack_ptr_ != nullptr) {
        ms_context_->allocator->Free(a_pack_ptr_);
        a_pack_ptr_ = nullptr;
      }
    }
#else
    if (a_pack_ptr_ != nullptr) {
      ms_context_->allocator->Free(a_pack_ptr_);
      a_pack_ptr_ = nullptr;
    }
#endif
  } else {
    a_pack_ptr_ = nullptr;
  }
}
256 
FreeResizeBufB()257 void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
258   if (!op_parameter_->is_train_session_) {
259     if (b_pack_ptr_ != nullptr) {
260       ms_context_->allocator->Free(b_pack_ptr_);
261       b_pack_ptr_ = nullptr;
262     }
263   } else {
264     b_pack_ptr_ = nullptr;
265   }
266 }
267 
// Per-thread worker: computes one task's band of output channels for the
// current batch. task_id selects a contiguous slice of
// thread_stride_ * col_tile_ columns starting at current_start_oc.
int MatmulFp32BaseCPUKernel::FloatRun(int task_id) const {
  int current_start_oc = task_id * thread_stride_ * col_tile_;
  int current_rest_oc = 0;
#if defined(ENABLE_AVX)
  // avx vector path writes into the col-aligned temporary output, so the
  // remaining width is measured against col_align_ rather than col_
  if (vec_matmul_) {
    current_rest_oc = params_->col_align_ - current_start_oc;
  } else {
    current_rest_oc = params_->col_ - current_start_oc;
  }
#else
  current_rest_oc = params_->col_ - current_start_oc;
#endif
  int cur_oc = MSMIN(thread_stride_ * col_tile_, current_rest_oc);
  if (cur_oc <= 0) {
    // this task's band lies beyond the last column: nothing to do
    return RET_OK;
  }

  // offset B, C and bias to this task's starting output channel
  auto b = batch_b_ptr_ + current_start_oc * params_->deep_;
  auto c = batch_c_ptr_ + current_start_oc;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + current_start_oc;
  if (vec_matmul_) {
#ifdef ENABLE_AVX
    MatVecMulAvxFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, params_->col_align_);
#elif defined(ENABLE_ARM64)
    int rest_align_col = MSMIN(params_->col_align_ - current_start_oc, thread_stride_ * col_tile_);
    MatVecMulFp32Neon64(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc, rest_align_col);
#else
    MatVecMulFp32(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#endif
  } else {
    MatMulOpt(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
              OutType_Nhwc);
  }
  return RET_OK;
}
303 
// Select the per-target pack functions and tile sizes, derive the default
// row alignment, and reset the vector-matmul flag (ResizeParameter() turns
// it back on when row_ == 1). Called from Init() and ResizeParameter().
void MatmulFp32BaseCPUKernel::init_global_variable() {
#ifdef ENABLE_AVX
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row6Major : RowMajor2Col6Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col16Major : RowMajor2Row16Major;
  row_tile_ = C6NUM;
  col_tile_ = C16NUM;
#elif defined(ENABLE_ARM32)
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row12Major : RowMajor2Col12Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col4Major : RowMajor2Row4Major;
  row_tile_ = C12NUM;
  col_tile_ = C4NUM;
#elif defined(ENABLE_SSE)
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row4Major : RowMajor2Col4Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col8Major : RowMajor2Row8Major;
  row_tile_ = C4NUM;
  col_tile_ = C8NUM;
#else
  matrix_a_pack_fun_ = params_->a_transpose_ ? RowMajor2Row12Major : RowMajor2Col12Major;
  matrix_b_pack_fun_ = params_->b_transpose_ ? RowMajor2Col8Major : RowMajor2Row8Major;
  row_tile_ = C12NUM;
  col_tile_ = C8NUM;
#endif
  params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
  vec_matmul_ = false;
}
329 
Init()330 int MatmulFp32BaseCPUKernel::Init() {
331   CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
332   CHECK_LESS_RETURN(out_tensors_.size(), 1);
333   init_global_variable();
334   matrix_a_pack_size_ = params_->batch * params_->row_align_ * params_->deep_;
335   if (matrix_a_pack_size_ < 0) {
336     MS_LOG(ERROR) << "Matrix pack size is negative "
337                   << "matrix_a_pack_size=" << matrix_a_pack_size_;
338     return RET_ERROR;
339   }
340   auto ret = InitBiasData();
341   if (ret != RET_OK) {
342     MS_LOG(ERROR) << "InitBiasData failed";
343     return ret;
344   }
345   if (params_->a_const_) {
346     if (RET_OK != InitBufferA()) {
347       return RET_ERROR;
348     }
349     ret = InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data()));
350     if (ret != RET_OK) {
351       MS_LOG(ERROR) << "InitMatrixA failed!";
352       return ret;
353     }
354   }
355   if (params_->b_const_) {
356     // only copy weight data
357     // resize or run to pack
358     auto b_tensor = in_tensors_.at(1);
359     src_b_ = reinterpret_cast<float *>(
360       malloc(params_->batch * params_->deep_ * params_->col_ * static_cast<int>(sizeof(float))));
361     if (src_b_ == nullptr) {
362       MS_LOG(ERROR) << "matmul fp16 src_b_ is failed!";
363       return RET_ERROR;
364     }
365     memcpy(src_b_, b_tensor->data(), params_->batch * params_->deep_ * params_->col_ * static_cast<int>(sizeof(float)));
366   }
367   return RET_OK;
368 }
369 
FreeBuffSrcB()370 void MatmulFp32BaseCPUKernel::FreeBuffSrcB() {
371   if (src_b_ != nullptr) {
372     free(src_b_);
373     src_b_ = nullptr;
374   }
375 }
376 
// Shape-dependent (re)initialization: recompute pack-buffer sizes, repack a
// constant weight stashed by Init(), and derive the per-thread column split.
int MatmulFp32BaseCPUKernel::ReSize() {
  ResizeParameter();
  matrix_a_pack_size_ = params_->batch * params_->row_align_ * params_->deep_;
  matrix_b_pack_size_ = params_->batch * params_->col_align_ * params_->deep_;
  if (matrix_a_pack_size_ < 0 || matrix_b_pack_size_ < 0) {
    // int overflow in the multiplications above shows up as a negative size
    MS_LOG(ERROR) << "Matrix pack size is negative "
                  << "matrix_a_pack_size=" << matrix_a_pack_size_ << "matrix_b_pack_size=" << matrix_b_pack_size_;
    return RET_ERROR;
  }
  if (op_parameter_->is_train_session_) {
    // train sessions keep both pack buffers in the externally-owned workspace
    set_workspace_size((matrix_a_pack_size_ + matrix_b_pack_size_) * static_cast<int>(sizeof(float)));
  }

  if (params_->b_const_ && src_b_ != nullptr) {
    // pack the weight copy stashed by Init(); src_b_ is not needed afterwards
    if (InitBufferB() != RET_OK) {
      FreeBuffSrcB();
      return RET_ERROR;
    }
    if (InitMatrixB(src_b_) != RET_OK) {
      FreeBuffSrcB();
      MS_LOG(ERROR) << "InitMatrixB failed!";
      return RET_ERROR;
    }
    FreeBuffSrcB();
  }
  // never spawn more threads than there are column tiles to work on
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_align_, col_tile_));
#if defined(ENABLE_AVX)
  if (vec_matmul_) {
    // avx vector kernel consumes C4NUM col-tiles at a time; keep the stride a multiple of C4NUM
    thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_ * C4NUM), thread_count_) * C4NUM;
  } else {
    thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_);
  }
#else
  thread_stride_ = UP_DIV(UP_DIV(params_->col_align_, col_tile_), thread_count_);
#endif
  return RET_OK;
}
414 
// Decide where the matmul writes its output. On AVX the vector path emits
// col_tile_-aligned blocks, so when col_ is not tile-aligned (oc_res_ != 0)
// a temporary aligned buffer is allocated and Run() later repacks it into the
// real output tensor. In every other case the output tensor is written
// directly. Returns RET_OK or RET_NULL_PTR on allocation failure.
int MatmulFp32BaseCPUKernel::InitTmpOutBuffer() {
  auto out_data = reinterpret_cast<float *>(out_tensors_.front()->data());
  MS_ASSERT(out_data != nullptr);
#ifdef ENABLE_AVX
  if (oc_res_ != 0 && vec_matmul_) {  // vec matmul needs an aligned temporary dst
    int out_channel = params_->col_;
    int oc_block_num = UP_DIV(out_channel, col_tile_);
    MS_ASSERT(ms_context_->allocator != nullptr);
    output_data_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(
      params_->batch * params_->row_ * oc_block_num * col_tile_ * static_cast<int>(sizeof(float))));
    if (output_data_ == nullptr) {
      MS_LOG(ERROR) << "malloc tmp output data failed.";
      return RET_NULL_PTR;
    }
  } else {  // layout already matches: write straight into the output tensor
    output_data_ = out_data;
  }
#else
  output_data_ = out_data;
#endif
  return RET_OK;
}
437 
// Execute the matmul: pack any non-constant inputs, pick the output buffer,
// launch the per-batch parallel compute, then release per-run resources.
int MatmulFp32BaseCPUKernel::Run() {
  if (!params_->a_const_) {
    auto a_ptr = reinterpret_cast<float *>(in_tensors_[0]->data());
    CHECK_NULL_RETURN(a_ptr);
    if (RET_OK != InitBufferA()) {
      return RET_ERROR;
    }
    auto ret = InitMatrixA(a_ptr);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "InitMatrixA failed!";
      return ret;
    }
  }
  if (!params_->b_const_) {
    auto b_ptr = reinterpret_cast<float *>(in_tensors_[1]->data());
    CHECK_NULL_RETURN(b_ptr);
    if (RET_OK != InitBufferB()) {
      // A's buffer was acquired above; release it on this error path
      FreeResizeBufA();
      return RET_ERROR;
    }
    auto ret = InitMatrixB(b_ptr);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "InitMatrixB failed!";
      return ret;
    }
  }

  auto ret = InitTmpOutBuffer();
  if (ret != RET_OK) {
    FreeResizeBufA();
    FreeResizeBufB();
    MS_LOG(ERROR) << "InitTmpOutBuffer error!";
    return ret;
  }

  // One parallel launch per batch; the batch_*_ptr_ members point the
  // FloatRun workers at the current batch's slices.
  for (int i = 0; i < params_->batch; ++i) {
    batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
    batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
    if (vec_matmul_) {
      batch_c_ptr_ = output_data_ + i * params_->row_ * params_->col_align_;
    } else {
      // need not aligned
      batch_c_ptr_ = output_data_ + i * params_->row_ * params_->col_;
    }
    ret = ParallelLaunch(this->ms_context_, MatmulBaseFloatRun, this, thread_count_);
    if (ret != RET_OK) {
      // NOTE(review): the loop continues after a failed launch and only the
      // last batch's status is returned — confirm this is intentional.
      MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
    }
  }

#ifdef ENABLE_AVX
  if (oc_res_ != 0 && vec_matmul_) {
    // repack the aligned temporary into the real (unaligned) output tensor
    auto out_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
    PackNHWCXToNHWCFp32(output_data_, out_data, params_->batch, params_->row_, params_->col_, col_tile_);
    ms_context_->allocator->Free(output_data_);
    output_data_ = nullptr;
  }
#endif
  if (!params_->a_const_) {
    FreeResizeBufA();
  }

  if (!params_->b_const_) {
    FreeResizeBufB();
  }
  return ret;
}
505 }  // namespace mindspore::kernel
506