1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/delegate/npu/npu_delegate.h"
18 #include <queue>
19 #include "include/errorcode.h"
20 #include "src/common/prim_util.h"
21 #include "src/delegate/npu/op/npu_op.h"
22 #include "src/delegate/npu/op/activation_npu.h"
23 #include "src/delegate/npu/op/argmax_npu.h"
24 #include "src/delegate/npu/op/arithmetic_npu.h"
25 #include "src/delegate/npu/op/arithmetic_self_npu.h"
26 #include "src/delegate/npu/op/avg_pooling_npu.h"
27 #include "src/delegate/npu/op/batchnorm_npu.h"
28 #include "src/delegate/npu/op/cast_npu.h"
29 #include "src/delegate/npu/op/concat_npu.h"
30 #include "src/delegate/npu/op/convolution_npu.h"
31 #include "src/delegate/npu/op/crop_and_resize_npu.h"
32 #include "src/delegate/npu/op/deconvolution_npu.h"
33 #include "src/delegate/npu/op/eltwise_npu.h"
34 #include "src/delegate/npu/op/expand_dims_npu.h"
35 #include "src/delegate/npu/op/fullconnection_npu.h"
36 #include "src/delegate/npu/op/gather_npu.h"
37 #include "src/delegate/npu/op/instance_norm_npu.h"
38 #include "src/delegate/npu/op/matmul_npu.h"
39 #include "src/delegate/npu/op/max_pooling_npu.h"
40 #include "src/delegate/npu/op/pad_npu.h"
41 #include "src/delegate/npu/op/reduce_npu.h"
42 #include "src/delegate/npu/op/reshape_npu.h"
43 #include "src/delegate/npu/op/resize_npu.h"
44 #include "src/delegate/npu/op/scale_npu.h"
45 #include "src/delegate/npu/op/slice_npu.h"
46 #include "src/delegate/npu/op/softmax_npu.h"
47 #include "src/delegate/npu/op/split_npu.h"
48 #include "src/delegate/npu/op/squeeze_npu.h"
49 #include "src/delegate/npu/op/strided_slice_npu.h"
50 #include "src/delegate/npu/op/tile_npu.h"
51 #include "src/delegate/npu/op/transpose_npu.h"
52 #include "src/delegate/npu/op/unsqueeze_npu.h"
53 #include "src/delegate/npu/npu_graph.h"
54 #include "src/delegate/delegate_utils.h"
55 #include "src/delegate/npu/pass/npu_transform_pass.h"
56 #include "src/delegate/npu/pass/npu_insert_transform_pass.h"
57 #include "src/delegate/npu/pass/npu_fusion_pass.h"
58
59 using mindspore::lite::RET_ERROR;
60 using mindspore::lite::RET_OK;
61
62 namespace mindspore {
// Releases the delegate-owned managers. The NPU manager is reset (tearing
// down any loaded OM models/clients) before deletion; the pass manager is
// cleared (releasing registered passes) before deletion. Pointers are nulled
// so a double-destruction or late access fails loudly rather than on a
// dangling pointer.
NPUDelegate::~NPUDelegate() {
  if (npu_manager_ != nullptr) {
    npu_manager_->Reset();
    delete npu_manager_;
    npu_manager_ = nullptr;
  }
  if (pass_manager_ != nullptr) {
    pass_manager_->Clear();
    delete pass_manager_;
    pass_manager_ = nullptr;
  }
}
75
Init()76 Status NPUDelegate::Init() {
77 npu_manager_ = new (std::nothrow) NPUManager(frequency_);
78 if (npu_manager_ == nullptr) {
79 MS_LOG(ERROR) << "New npu manager failed.";
80 return mindspore::kLiteNullptr;
81 }
82 if (!npu_manager_->IsSupportNPU()) {
83 MS_LOG(DEBUG) << "Checking that npu is unsupported.";
84 free(npu_manager_);
85 npu_manager_ = nullptr;
86 return mindspore::kLiteNotSupport;
87 }
88 pass_manager_ = new (std::nothrow) NPUPassManager();
89 if (pass_manager_ == nullptr) {
90 free(npu_manager_);
91 npu_manager_ = nullptr;
92 MS_LOG(ERROR) << "New npu pass manager failed.";
93 return mindspore::kLiteNullptr;
94 }
95 auto transform_pass = new (std::nothrow) NPUTransformPass();
96 pass_manager_->AddPass(transform_pass);
97 auto insert_transform_pass = new (std::nothrow) NPUInsertTransformPass();
98 pass_manager_->AddPass(insert_transform_pass);
99 auto fusion_pass = new (std::nothrow) NPUFusionPass();
100 pass_manager_->AddPass(fusion_pass);
101
102 op_func_lists_.clear();
103 op_func_lists_ = {
104 {schema::PrimitiveType_Activation, GetNPUOp<ActivationNPUOp>},
105 {schema::PrimitiveType_ArgMaxFusion, GetNPUOp<ArgmaxNPUOp>},
106 {schema::PrimitiveType_MulFusion, GetNPUOp<ArithmeticNPUOp>},
107 {schema::PrimitiveType_AddFusion, GetNPUOp<ArithmeticNPUOp>},
108 {schema::PrimitiveType_SubFusion, GetNPUOp<ArithmeticNPUOp>},
109 {schema::PrimitiveType_DivFusion, GetNPUOp<ArithmeticNPUOp>},
110 {schema::PrimitiveType_FloorMod, GetNPUOp<ArithmeticNPUOp>},
111 {schema::PrimitiveType_FloorDiv, GetNPUOp<ArithmeticNPUOp>},
112 {schema::PrimitiveType_LogicalAnd, GetNPUOp<ArithmeticNPUOp>},
113 {schema::PrimitiveType_LogicalOr, GetNPUOp<ArithmeticNPUOp>},
114 {schema::PrimitiveType_Maximum, GetNPUOp<ArithmeticNPUOp>},
115 {schema::PrimitiveType_Minimum, GetNPUOp<ArithmeticNPUOp>},
116 {schema::PrimitiveType_NotEqual, GetNPUOp<ArithmeticNPUOp>},
117 {schema::PrimitiveType_Equal, GetNPUOp<ArithmeticNPUOp>},
118 {schema::PrimitiveType_Less, GetNPUOp<ArithmeticNPUOp>},
119 {schema::PrimitiveType_LessEqual, GetNPUOp<ArithmeticNPUOp>},
120 {schema::PrimitiveType_Greater, GetNPUOp<ArithmeticNPUOp>},
121 {schema::PrimitiveType_GreaterEqual, GetNPUOp<ArithmeticNPUOp>},
122 {schema::PrimitiveType_Ceil, GetNPUOp<ArithmeticSelfNPUOp>},
123 {schema::PrimitiveType_Cos, GetNPUOp<ArithmeticSelfNPUOp>},
124 {schema::PrimitiveType_Floor, GetNPUOp<ArithmeticSelfNPUOp>},
125 {schema::PrimitiveType_Log, GetNPUOp<ArithmeticSelfNPUOp>},
126 {schema::PrimitiveType_LogicalNot, GetNPUOp<ArithmeticSelfNPUOp>},
127 {schema::PrimitiveType_Neg, GetNPUOp<ArithmeticSelfNPUOp>},
128 {schema::PrimitiveType_Reciprocal, GetNPUOp<ArithmeticSelfNPUOp>},
129 {schema::PrimitiveType_Round, GetNPUOp<ArithmeticSelfNPUOp>},
130 {schema::PrimitiveType_Rsqrt, GetNPUOp<ArithmeticSelfNPUOp>},
131 {schema::PrimitiveType_Sin, GetNPUOp<ArithmeticSelfNPUOp>},
132 {schema::PrimitiveType_Sqrt, GetNPUOp<ArithmeticSelfNPUOp>},
133 {schema::PrimitiveType_Square, GetNPUOp<ArithmeticSelfNPUOp>},
134 {schema::PrimitiveType_AvgPoolFusion, GetNPUOp<AvgPoolingNPUOp>},
135 {schema::PrimitiveType_MaxPoolFusion, GetNPUOp<MaxPoolingNPUOp>},
136 {schema::PrimitiveType_FusedBatchNorm, GetNPUOp<BatchnormNPUOp>},
137 {schema::PrimitiveType_Cast, GetNPUOp<CastNPUOp>},
138 {schema::PrimitiveType_Concat, GetNPUOp<ConcatNPUOp>},
139 {schema::PrimitiveType_Conv2dTransposeFusion, GetNPUOp<DeconvolutionNPUOp>},
140 {schema::PrimitiveType_CropAndResize, GetNPUOp<CropAndResizeNPUOp>},
141 {schema::PrimitiveType_Eltwise, GetNPUOp<EltwiseNPUOp>},
142 {schema::PrimitiveType_ExpandDims, GetNPUOp<ExpandDimsNPUOp>},
143 {schema::PrimitiveType_FullConnection, GetNPUOp<FullconnectionNPUOp>},
144 {schema::PrimitiveType_Gather, GetNPUOp<GatherNPUOp>},
145 {schema::PrimitiveType_InstanceNorm, GetNPUOp<InstanceNormNPUOp>},
146 {schema::PrimitiveType_MatMulFusion, GetNPUOp<MatMulNPUOp>},
147 {schema::PrimitiveType_PadFusion, GetNPUOp<PadNPUOp>},
148 {schema::PrimitiveType_ReduceFusion, GetNPUOp<ReduceNPUOp>},
149 {schema::PrimitiveType_Reshape, GetNPUOp<ReshapeNPUOp>},
150 {schema::PrimitiveType_Resize, GetNPUOp<ResizeNPUOp>},
151 {schema::PrimitiveType_ScaleFusion, GetNPUOp<ScaleNPUOp>},
152 {schema::PrimitiveType_SliceFusion, GetNPUOp<SliceNPUOp>},
153 {schema::PrimitiveType_Softmax, GetNPUOp<SoftmaxNPUOp>},
154 {schema::PrimitiveType_Split, GetNPUOp<SplitNPUOp>},
155 {schema::PrimitiveType_Squeeze, GetNPUOp<SqueezeNPUOp>},
156 {schema::PrimitiveType_StridedSlice, GetNPUOp<StridedSliceNPUOp>},
157 {schema::PrimitiveType_TileFusion, GetNPUOp<TileNPUOp>},
158 {schema::PrimitiveType_Transpose, GetNPUOp<TransposeNPUOp>},
159 {schema::PrimitiveType_Unsqueeze, GetNPUOp<UnsqueezeNPUOp>},
160 };
161 return mindspore::kSuccess;
162 }
163
// Walks the model's kernel list and fuses every maximal contiguous run of
// NPU-supported kernels into a single NPUGraph subgraph kernel, then loads
// all built OM models onto the device.
//
// @param model  The delegate model whose kernel list is partitioned in place.
// @return kSuccess, or kLiteNullptr / kLiteError on failure.
Status NPUDelegate::Build(DelegateModel<schema::Primitive> *model) {
  // [from, end] brackets the current run of consecutive supported kernels.
  KernelIter from, end;
  std::vector<NPUOp *> npu_ops;
  int graph_index = 0;
  for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) {
    kernel::Kernel *kernel = *iter;
    auto npu_op = GetOP(kernel, model->GetPrimitive(kernel));
    if (npu_op != nullptr) {
      // If npu_op does not equal nullptr, this kernel can be supported by delegate
      if (npu_ops.size() == 0) {
        from = iter;  // first supported kernel of a new run
      }
      npu_ops.push_back(npu_op);
      end = iter;
    } else {
      // Unsupported kernel terminates the current run (if any): replace the
      // run [from, end] with one fused NPU subgraph kernel.
      if (npu_ops.size() > 0) {
        auto npu_graph_kernel = CreateNPUGraph(npu_ops, model, from, end);
        if (npu_graph_kernel == nullptr) {
          MS_LOG(ERROR) << "Create NPU Graph failed.";
          return mindspore::kLiteNullptr;
        }
        npu_graph_kernel->set_name("NpuGraph" + std::to_string(graph_index++));
        // Replace takes a half-open range, hence end + 1; it returns the
        // iterator to continue from, re-seating iter after the mutation.
        iter = model->Replace(from, end + 1, npu_graph_kernel);
        npu_ops.clear();
      }
    }
  }
  // Flush a run that extends to the end of the kernel list.
  if (npu_ops.size() > 0) {
    auto npu_graph_kernel = CreateNPUGraph(npu_ops, model, from, end);
    if (npu_graph_kernel == nullptr) {
      MS_LOG(ERROR) << "Create NPU Graph failed.";
      return mindspore::kLiteNullptr;
    }
    npu_graph_kernel->set_name("NpuGraph" + std::to_string(graph_index++));
    model->Replace(from, end + 1, npu_graph_kernel);
    npu_ops.clear();
  }
  // All subgraphs are built; hand the accumulated OM models to the NPU client.
  auto ret = npu_manager_->LoadOMModel();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "NPU client load model failed.";
    return mindspore::kLiteError;
  }
  return mindspore::kSuccess;
}
208
GetOP(kernel::Kernel * kernel,const schema::Primitive * primitive)209 NPUOp *NPUDelegate::GetOP(kernel::Kernel *kernel, const schema::Primitive *primitive) {
210 if (primitive == nullptr) {
211 MS_LOG(ERROR) << "primitive is NULL!";
212 return nullptr;
213 }
214 if (kernel == nullptr) {
215 MS_LOG(ERROR) << "kernel is NULL!";
216 return nullptr;
217 }
218 auto name = kernel->name();
219 NPUOp *npu_op = nullptr;
220 auto node_type = primitive->value_type();
221 if (node_type == schema::PrimitiveType_Conv2DFusion) {
222 npu_op = GetNPUConvOp(primitive, kernel->inputs(), kernel->outputs(), name);
223 } else {
224 if (op_func_lists_.find(node_type) != op_func_lists_.end()) {
225 npu_op = op_func_lists_[node_type](primitive, kernel->inputs(), kernel->outputs(), name);
226 } else {
227 MS_LOG(DEBUG) << "Unsupported op type for NPU.";
228 return nullptr;
229 }
230 }
231
232 for (int i = 0; i < kernel->inputs().size(); i++) {
233 mindspore::MSTensor tensor = kernel->inputs()[i];
234 if (tensor.DataType() == DataType::kNumberTypeFloat16 && tensor.Data() == nullptr) {
235 tensor.SetDataType(DataType::kNumberTypeFloat32);
236 }
237 }
238 for (int i = 0; i < kernel->outputs().size(); i++) {
239 mindspore::MSTensor tensor = kernel->outputs()[i];
240 if (tensor.DataType() == DataType::kNumberTypeFloat16) {
241 tensor.SetDataType(DataType::kNumberTypeFloat32);
242 }
243 }
244
245 if (npu_op != nullptr) {
246 MS_LOG(DEBUG) << "kernel: [" << kernel->name().c_str() << "] op success. "
247 << "op_type: " << lite::PrimitiveCurVersionTypeName(kernel->type()) << ", "
248 << "arch: " << kKirinNPU;
249 }
250 return npu_op;
251 }
252
GraphOutTensors(const std::vector<NPUOp * > & ops,DelegateModel<schema::Primitive> * model,KernelIter from,KernelIter end)253 std::vector<mindspore::MSTensor> GraphOutTensors(const std::vector<NPUOp *> &ops,
254 DelegateModel<schema::Primitive> *model, KernelIter from,
255 KernelIter end) {
256 auto out_tensors = lite::GetGraphOutTensors(ops);
257 std::vector<mindspore::MSTensor> all_out_tensors;
258 for (auto op : ops) {
259 for (auto out_tensor : op->outputs()) {
260 if (find(out_tensors.begin(), out_tensors.end(), out_tensor) == out_tensors.end()) {
261 all_out_tensors.push_back(out_tensor);
262 }
263 }
264 }
265
266 for (KernelIter iter = model->BeginKernelIterator(); iter != model->EndKernelIterator(); iter++) {
267 if (iter >= from && iter <= end) {
268 continue;
269 }
270 // The input of other kernels is the output of the current subgraph kernel.
271 for (auto in_tensor : (*iter)->inputs()) {
272 if (find(all_out_tensors.begin(), all_out_tensors.end(), in_tensor) != all_out_tensors.end() &&
273 find(out_tensors.begin(), out_tensors.end(), in_tensor) == out_tensors.end()) {
274 out_tensors.push_back(in_tensor);
275 }
276 }
277 }
278 return out_tensors;
279 }
280
CreateNPUGraph(const std::vector<NPUOp * > & ops,DelegateModel<schema::Primitive> * model,KernelIter from,KernelIter end)281 kernel::Kernel *NPUDelegate::CreateNPUGraph(const std::vector<NPUOp *> &ops, DelegateModel<schema::Primitive> *model,
282 KernelIter from, KernelIter end) {
283 auto in_tensors = lite::GetGraphInTensors(ops);
284 auto out_tensors = GraphOutTensors(ops, model, from, end);
285 auto graph_kernel = new (std::nothrow) NPUGraph(ops, npu_manager_, in_tensors, out_tensors);
286 if (graph_kernel == nullptr) {
287 MS_LOG(DEBUG) << "New NPU Graph failed.";
288 return nullptr;
289 }
290 // 1. For every op, find pre and next ops
291 auto ret = graph_kernel->FindPreNextOps();
292 if (ret != RET_OK) {
293 MS_LOG(DEBUG) << "NPU Graph find input and output ops for every op failed.";
294 return nullptr;
295 }
296 // 2. Pass
297 ret = pass_manager_->RunPass(graph_kernel);
298 if (ret != RET_OK) {
299 MS_LOG(DEBUG) << "NPU Graph run pass failed. This function mainly solves the problem that the format is "
300 "inconsistent and requires interpolation transpose operators.";
301 return nullptr;
302 }
303 // 3. NPUGraph init, create subgraph_kernel and transpose_kernel
304 ret = graph_kernel->Init();
305 if (ret != RET_OK) {
306 MS_LOG(DEBUG) << "NPU subgraph Init failed.";
307 return nullptr;
308 }
309 return graph_kernel;
310 }
311 } // namespace mindspore
312