/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "include/errorcode.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INPUT_TENSOR_ERROR;
using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {
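// ParallelLaunch trampoline: recovers the kernel object from the opaque
// callback context and runs one column slice of the matmul. lhs_scale and
// rhs_scale are part of the runner signature and unused here.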
int MatmulBaseFP16Run(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
  CHECK_NULL_RETURN(cdata);
  auto op = reinterpret_cast<MatmulBaseFP16CPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulBaseFP16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

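// Frees the cached fp16 copy of constant matrix B, the bias buffer, and any
// packed A/B buffers still held.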
MatmulBaseFP16CPUKernel::~MatmulBaseFP16CPUKernel() {
  if (src_b_ != nullptr) {
    free(src_b_);
    src_b_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    free(bias_ptr_);
    bias_ptr_ = nullptr;
  }
  FreeResizeBufA();
  FreeResizeBufB();
}

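// Returns the packed matrix-A buffer to the context allocator.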
void MatmulBaseFP16CPUKernel::FreeResizeBufA() {
  if (a_pack_ptr_ != nullptr) {
    ms_context_->allocator->Free(a_pack_ptr_);
    a_pack_ptr_ = nullptr;
  }
  return;
}

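// Returns the packed matrix-B buffer to the context allocator.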
void MatmulBaseFP16CPUKernel::FreeResizeBufB() {
  if (b_pack_ptr_ != nullptr) {
    ms_context_->allocator->Free(b_pack_ptr_);
    b_pack_ptr_ = nullptr;
  }
  return;
}

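// An input tensor that already holds data is treated as constant, so it can
// be packed once up front instead of on every Run().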
void MatmulBaseFP16CPUKernel::InitParameter() {
  NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[0]);
  NNACL_CHECK_NULL_RETURN_VOID(in_tensors_[1]);
  params_->a_const_ = (in_tensors_[0]->data() != nullptr);
  params_->b_const_ = (in_tensors_[1]->data() != nullptr);
}

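// Allocates the bias buffer rounded up to a multiple of C16NUM entries
// (presumably so vectorized kernels can safely read past col_), then copies
// the bias tensor if one is supplied, or zero-fills otherwise.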
int MatmulBaseFP16CPUKernel::InitBias() {
  if (params_->col_ != 0 && bias_ptr_ == nullptr) {
    int max_bias_data = UP_ROUND(params_->col_, C16NUM);
    bias_ptr_ = reinterpret_cast<float16_t *>(malloc(max_bias_data * sizeof(float16_t)));
    if (bias_ptr_ == nullptr) {
      MS_LOG(ERROR) << "malloc bias_ptr_ failed";
      return RET_ERROR;
    }
    if (in_tensors_.size() == 3) {
      auto bias_tensor = in_tensors_[2];
      CHECK_NULL_RETURN(bias_tensor);
      CHECK_NULL_RETURN(bias_tensor->data());
      memcpy(bias_ptr_, bias_tensor->data(), bias_tensor->ElementsNum() * sizeof(float16_t));
    } else {
      memset(bias_ptr_, 0, max_bias_data * sizeof(float16_t));
    }
  }
  return RET_OK;
}

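// Repacks a constant matrix B stashed by Init() (packing is deferred until
// shapes are known), then splits the output columns across threads.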
int MatmulBaseFP16CPUKernel::ReSize() {
  ResizeParameter();

  if (params_->b_const_ == true && src_b_ != nullptr) {
    auto ret = InitBufferB();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
      return ret;
    }
    InitMatrixB(src_b_, kNumberTypeFloat16);
    free(src_b_);
    src_b_ = nullptr;
  }
  if (vec_matmul_) {
#ifdef ENABLE_ARM64
    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C16NUM));
    thread_stride_ = UP_DIV(UP_DIV(params_->col_, C16NUM), thread_count_) * C16NUM;
#else
    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
    thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
#endif
  } else {
    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
    thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
  }
  return RET_OK;
}

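// A single-row matrix A degenerates into a matrix-vector product, which uses
// different row/column alignment than the tiled matmul path.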
void MatmulBaseFP16CPUKernel::ResizeParameter() {
  if (params_->row_ == 1) {
    vec_matmul_ = true;
  }

  if (vec_matmul_) {
    params_->row_align_ = 1;
#ifdef ENABLE_ARM64
    params_->col_align_ = UP_ROUND(params_->col_, C16NUM);
#else
    params_->col_align_ = params_->col_;
#endif
  } else {
    params_->row_align_ = UP_ROUND(params_->row_, row_tile_);
    params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
  }
}

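// Allocates and zeroes the packed matrix-A buffer:
// batch * row_align_ * deep_ fp16 elements.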
int MatmulBaseFP16CPUKernel::InitBufferA() {
  a_pack_ptr_ = reinterpret_cast<float16_t *>(
    ms_context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t)));
  if (a_pack_ptr_ == nullptr) {
    return RET_MEMORY_FAILED;
  }

  memset(a_pack_ptr_, 0, params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t));
  return RET_OK;
}

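// Allocates and zeroes the packed matrix-B buffer (kept across calls once
// allocated): batch * col_align_ * deep_ fp16 elements.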
int MatmulBaseFP16CPUKernel::InitBufferB() {
  if (b_pack_ptr_ != nullptr) {
    return RET_OK;
  }

  b_pack_ptr_ = reinterpret_cast<float16_t *>(
    ms_context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t)));
  if (b_pack_ptr_ == nullptr) {
    return RET_MEMORY_FAILED;
  }

  memset(b_pack_ptr_, 0, params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t));
  return RET_OK;
}

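// Packs matrix A into the layout the fp16 matmul kernels expect, converting
// from fp32 on the fly when needed. The vector path only needs a flat copy;
// the matrix path packs each batch, honoring a_transpose_.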
void MatmulBaseFP16CPUKernel::InitMatrixA(const void *src_ptr) {
  NNACL_CHECK_NULL_RETURN_VOID(src_ptr);
  auto src_data_type = in_tensors_[0]->data_type();

  if (vec_matmul_) {
    if (src_data_type == kNumberTypeFloat32) {
      Float32ToFloat16(reinterpret_cast<const float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
    } else {
      memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t));
    }
    return;
  }

  const int8_t *int8_src = reinterpret_cast<const int8_t *>(src_ptr);
  for (int i = 0; i < params_->batch; i++) {
    const int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
    float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
    if (params_->a_transpose_) {
#ifdef ENABLE_ARM64
      RowMajor2RowNMajorFp16((const float16_t *)src, dst, params_->deep_, params_->row_);
#else
      RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
#endif
    } else {
#ifdef ENABLE_ARM64
      RowMajor2ColNMajorFp16((const float16_t *)src, dst, params_->row_, params_->deep_);
#else
      RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
#endif
    }
  }
  return;
}

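// Packs matrix B per batch into the row/column-major tile layout the target
// kernel expects; the chosen packing differs between the ARM64 and generic
// builds and between the vector and matrix paths.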
void MatmulBaseFP16CPUKernel::InitMatrixB(const void *src_ptr, TypeId src_data_type) {
  NNACL_CHECK_NULL_RETURN_VOID(src_ptr);
  const int8_t *int8_src = reinterpret_cast<const int8_t *>(src_ptr);

  if (vec_matmul_) {
    if (params_->b_transpose_) {
      if (src_data_type == kNumberTypeFloat32) {
        Float32ToFloat16(reinterpret_cast<const float *>(src_ptr), b_pack_ptr_,
                         params_->batch * params_->col_ * params_->deep_);
      } else {
#ifdef ENABLE_ARM64
        for (auto i = 0; i < params_->batch; ++i) {
          const auto *b_src = reinterpret_cast<const float16_t *>(src_ptr) + i * params_->col_align_ * params_->deep_;
          auto *dst = b_pack_ptr_ + i * params_->col_align_ * params_->deep_;
          RowMajor2Col16MajorFp16Opt(b_src, dst, params_->col_, params_->deep_);
        }
#else
        memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
#endif
      }
    } else {
      for (int i = 0; i < params_->batch; i++) {
#ifdef ENABLE_ARM64
        const auto *b_src = reinterpret_cast<const float16_t *>(src_ptr) + i * params_->col_align_ * params_->deep_;
        auto *dst = b_pack_ptr_ + i * params_->col_align_ * params_->deep_;
        RowMajor2Row16MajorFp16Opt(b_src, dst, params_->deep_, params_->col_);
#else
        const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
        float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_;
        RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
#endif
      }
    }
    return;
  }

  for (int i = 0; i < params_->batch; i++) {
    const int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
    float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
    if (params_->b_transpose_) {
      RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32);
    } else {
      RowMajor2Row8MajorFp16(src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
    }
  }
  return;
}

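// For a constant A, packs it immediately; for a constant B, keeps a raw fp16
// copy in src_b_ to be packed in ReSize() once shapes are inferred.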
int MatmulBaseFP16CPUKernel::Init() {
  CHECK_LESS_RETURN(in_tensors_.size(), 2);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
  ResizeParameter();
  if (params_->a_const_ == true) {
    if (RET_OK != InitBufferA()) {
      return RET_ERROR;
    }
    MS_ASSERT(in_tensors_[0] != nullptr);
    MS_ASSERT(in_tensors_[0]->data() != nullptr);
    InitMatrixA(in_tensors_[0]->data());
  }

  if (params_->b_const_ == true) {
    /* copy the original B data here; it is packed in ReSize(),
     * after infer shape has run */
    auto b_tensor = in_tensors_[1];
    MS_ASSERT(b_tensor != nullptr);
    MS_ASSERT(b_tensor->data() != nullptr);
    src_b_ = reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)));
    if (src_b_ == nullptr) {
      MS_LOG(ERROR) << "Matmul fp16 malloc src_b_ failed";
      return RET_ERROR;
    }

    if (b_tensor->data_type() == kNumberTypeFloat32) {
      Float32ToFloat16(reinterpret_cast<float *>(b_tensor->data()), src_b_,
                       params_->batch * params_->col_ * params_->deep_);
    } else {
      memcpy(src_b_, b_tensor->data(), params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
    }
  }

  auto ret = InitBias();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Matmul fp16 init bias failed";
    return RET_ERROR;
  }
  return RET_OK;
}

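// Computes one band of output columns: task_id selects a thread_stride_-wide
// slice of B and C. Trailing tasks may get a narrower slice, or none at all.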
int MatmulBaseFP16CPUKernel::RunImpl(int task_id) {
  int cur_stride = params_->col_ - task_id * thread_stride_;
  int cur_oc = MSMIN(thread_stride_, cur_stride);
  if (cur_oc <= 0) {
    return RET_OK;
  }

  auto bias = bias_ptr_ + thread_stride_ * task_id;
  auto b = batch_b_ptr_ + task_id * thread_stride_ * params_->deep_;
  auto c = batch_c_ptr_ + task_id * thread_stride_;

  if (vec_matmul_) {
#ifdef ENABLE_ARM64
    VecMatmulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#else
    MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
#endif
  } else {
#ifdef ENABLE_ARM64
    MatmulBaseFp16Neon(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc,
                       params_->col_, OutType_Nhwc);
#else
    MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
               OutType_Nhwc);
#endif
  }
  return RET_OK;
}

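// Packs any non-constant inputs for this invocation, launches the
// column-parallel matmul once per batch, and releases the temporary packed
// buffers of non-constant inputs afterwards.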
int MatmulBaseFP16CPUKernel::Run() {
  auto c_ptr = reinterpret_cast<float16_t *>(out_tensors_[0]->data());
  CHECK_NULL_RETURN(c_ptr);

  if ((params_->a_const_ == false) || IsRepack()) {
    if (RET_OK != InitBufferA()) {
      return RET_ERROR;
    }
    InitMatrixA(in_tensors_[0]->data());
  }
  if ((params_->b_const_ == false) || IsRepack()) {
    if (RET_OK != InitBufferB()) {
      FreeResizeBufA();
      return RET_ERROR;
    }
    InitMatrixB(in_tensors_[1]->data(), in_tensors_[1]->data_type());
    if (InitBias() != RET_OK) {
      MS_LOG(ERROR) << "Matmul fp16 init bias failed";
      return RET_ERROR;
    }
  }

  for (int i = 0; i < params_->batch; ++i) {
    if (vec_matmul_) {
      batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
#ifdef ENABLE_ARM64
      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
#else
      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
#endif
      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
    } else {
      batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
    }
    auto ret = ParallelLaunch(this->ms_context_, MatmulBaseFP16Run, this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulBaseFP16Run failed";
      return ret;
    }
  }

  if (params_->a_const_ == false) {
    FreeResizeBufA();
  }

  if (params_->b_const_ == false) {
    FreeResizeBufB();
  }
  return RET_OK;
}
}  // namespace mindspore::kernel