1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "src/runtime/kernel/arm/fp16/stack_fp16.h"
17 #include <vector>
18 #include "schema/model_generated.h"
19 #include "src/kernel_registry.h"
20 #include "nnacl/stack_parameter.h"
21 #include "include/errorcode.h"
22 #include "src/runtime/kernel/arm/fp16/common_fp16.h"
23 #include "nnacl/fp16/cast_fp16.h"
24 #include "nnacl/base/stack_base.h"
25 #include "nnacl/errorcode.h"
26
27 using mindspore::lite::KernelRegistrar;
28 using mindspore::lite::RET_ERROR;
29 using mindspore::lite::RET_OK;
30 using mindspore::schema::PrimitiveType_Stack;
31
32 namespace mindspore::kernel {
namespace {
// Minimum outer-dimension work (rows) per thread; used in Run() to cap the
// number of launched threads so tiny stacks don't pay parallelism overhead.
constexpr int kStackStep = 64;
}  // namespace
36
InitMallocFlags()37 void StackFp16CPUKernel::InitMallocFlags() {
38 malloc_buffers_.resize(in_tensors_.size());
39 for (size_t i = 0; i < in_tensors_.size(); ++i) {
40 malloc_buffers_.at(i) = in_tensors_.at(i)->data_type() == kNumberTypeFloat32;
41 }
42 malloc_out_ = out_tensors_.at(0)->data_type() == kNumberTypeFloat32;
43 }
44
MallocAssignBuffer()45 int StackFp16CPUKernel::MallocAssignBuffer() {
46 buffers_.resize(in_tensors_.size(), nullptr);
47 for (size_t i = 0; i < in_tensors_.size(); ++i) {
48 buffers_.at(i) = reinterpret_cast<void *>(
49 ConvertInputFp32toFp16(in_tensors_.at(i), static_cast<const lite::InnerContext *>(ms_context_)));
50 if (buffers_.at(i) == nullptr) {
51 return RET_ERROR;
52 }
53 }
54
55 out_buffer_ = nullptr;
56 out_buffer_ = MallocOutputFp16(out_tensors_.at(0), static_cast<const lite::InnerContext *>(this->ms_context_));
57 if (out_buffer_ == nullptr) {
58 return RET_ERROR;
59 }
60 return RET_OK;
61 }
62
FreeBuffer()63 void StackFp16CPUKernel::FreeBuffer() {
64 for (size_t i = 0; i < buffers_.size(); ++i) {
65 if (malloc_buffers_.at(i) && buffers_.at(i) != nullptr) {
66 ms_context_->allocator->Free(buffers_.at(i));
67 buffers_.at(i) = nullptr;
68 }
69 }
70 if (malloc_out_ && out_buffer_ != nullptr) {
71 ms_context_->allocator->Free(out_buffer_);
72 out_buffer_ = nullptr;
73 }
74 }
75
Init()76 int StackFp16CPUKernel::Init() {
77 CHECK_LESS_RETURN(in_tensors_.size(), 1);
78 CHECK_LESS_RETURN(out_tensors_.size(), 1);
79 data_type_size_ = sizeof(float16_t);
80 if (!InferShapeDone()) {
81 return RET_OK;
82 }
83 return ReSize();
84 }
85
DoExecute(int task_id)86 int StackFp16CPUKernel::DoExecute(int task_id) {
87 auto inputs = buffers_.data();
88 void *output_data = reinterpret_cast<void *>(out_buffer_);
89 auto step = UP_DIV(outer_size_, num_threads_);
90 MS_CHECK_FALSE(INT_MUL_OVERFLOW(task_id, step), RET_ERROR);
91 auto start = task_id * step;
92 auto end = MSMIN(start + step, outer_size_);
93 auto input_num = in_tensors_.size();
94 MS_CHECK_FALSE(INT_MUL_OVERFLOW(input_num * start, copy_size_), RET_ERROR);
95 void *output = reinterpret_cast<char *>(output_data) + input_num * start * copy_size_;
96 Stack(inputs, reinterpret_cast<void *>(output), input_num, copy_size_, start, end);
97 return RET_OK;
98 }
99
StackRun(void * cdata,int task_id,float lhs_scale,float rhs_scale)100 static int StackRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
101 auto stack = reinterpret_cast<StackFp16CPUKernel *>(cdata);
102 if (stack->DoExecute(task_id) != RET_OK) {
103 return RET_ERROR;
104 }
105 return RET_OK;
106 }
107
Run()108 int StackFp16CPUKernel::Run() {
109 InitMallocFlags();
110 auto ret = MallocAssignBuffer();
111 if (ret != RET_OK) {
112 FreeBuffer();
113 return ret;
114 }
115 // run stack
116 num_threads_ = MSMIN(UP_DIV(outer_size_, kStackStep), this->op_parameter_->thread_num_);
117 ret = ParallelLaunch(this->ms_context_, StackRun, this, num_threads_);
118 if (ret != RET_OK) {
119 MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
120 return RET_ERROR;
121 }
122 // if output tensor is fp32, we need to transform
123 if (malloc_out_) {
124 auto out_tensor = out_tensors_.at(0);
125 MS_ASSERT(out_tensor != nullptr);
126 MS_ASSERT(out_tensor->data() != nullptr);
127 Float16ToFloat32(out_buffer_, reinterpret_cast<float *>(out_tensor->data()), out_tensor->ElementsNum());
128 }
129 FreeBuffer();
130 return RET_OK;
131 }
132
// Register this class as the fp16 CPU implementation of the Stack primitive.
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Stack, LiteKernelCreator<StackFp16CPUKernel>)
134 } // namespace mindspore::kernel
135