1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/delegate/npu/npu_graph.h"
18 #include <queue>
19 #include "src/delegate/npu/npu_subgraph.h"
20 #include "src/delegate/delegate_utils.h"
21 #include "src/delegate/npu/op/transpose_npu.h"
22 #include "src/delegate/npu/transpose_kernel.h"
23 namespace mindspore {
~NPUGraph()24 NPUGraph::~NPUGraph() {
25 for (auto *kernel : all_kernels_) {
26 delete kernel;
27 }
28 for (auto *op : npu_ops_) {
29 delete op;
30 }
31 for (auto tensor : insert_tensors_) {
32 MSTensor::DestroyTensorPtr(tensor);
33 }
34 }
35
set_input(mindspore::MSTensor in_tensor,int index)36 void NPUGraph::set_input(mindspore::MSTensor in_tensor, int index) {
37 MS_ASSERT(index < inputs_.size());
38 auto origin_tensor = this->inputs_[index];
39 for (auto kernel : all_kernels_) {
40 for (size_t i = 0; i < kernel->inputs().size(); i++) {
41 if (kernel->inputs()[i] == origin_tensor) {
42 kernel->set_input(in_tensor, i);
43 }
44 }
45 }
46 this->inputs_[index] = in_tensor;
47 }
48
set_output(mindspore::MSTensor out_tensor,int index)49 void NPUGraph::set_output(mindspore::MSTensor out_tensor, int index) {
50 MS_ASSERT(index < outputs_.size());
51 auto origin_tensor = this->outputs_[index];
52 for (auto kernel : all_kernels_) {
53 for (size_t i = 0; i < kernel->outputs().size(); i++) {
54 if (kernel->outputs()[i] == origin_tensor) {
55 kernel->set_output(out_tensor, i);
56 }
57 }
58 }
59 this->outputs_[index] = out_tensor;
60 }
61
Init()62 int NPUGraph::Init() {
63 all_kernels_.clear();
64 std::map<const NPUOp *, bool> is_visited;
65 std::map<const NPUOp *, bool> is_searched;
66 std::queue<NPUOp *> candidate_in_ops;
67 std::queue<NPUOp *> valid_in_ops;
68 // Initialization
69 for (auto op : npu_ops_) {
70 is_visited[op] = false;
71 is_searched[op] = false;
72 if (op->in_ops().empty()) {
73 candidate_in_ops.push(op);
74 }
75 }
76 while (!candidate_in_ops.empty()) {
77 // 1. Find out all input ops except transpose, and handle transpose ops independently.
78 auto ret = FindValidSubgraphInOps(&valid_in_ops, &candidate_in_ops, &is_visited);
79 if (ret != RET_OK) {
80 MS_LOG(DEBUG) << "Fail to find valid input ops or handle transpose ops.";
81 return RET_ERROR;
82 }
83 if (valid_in_ops.empty()) {
84 MS_LOG(INFO) << "Can not find input ops except transpose.";
85 break;
86 }
87 // 2. Find out all ready ops based on valid input ops, but these ops maybe not belong to the same subgraph.
88 auto ready_ops = FindReadySubgraphOps(valid_in_ops, &candidate_in_ops, &is_visited);
89 // 3. Create subgraph(s). Input ops with connection will be built into a same subgraph.
90 ret = CreateSubgraphFromReadyOps(&valid_in_ops, ready_ops, &is_searched);
91 if (ret != RET_OK) {
92 MS_LOG(DEBUG) << "Fail to create subgraph(s) from ready ops.";
93 return RET_ERROR;
94 }
95 }
96 return RET_OK;
97 }
98
FindPreOps(NPUOp * cur_op)99 std::vector<NPUOp *> NPUGraph::FindPreOps(NPUOp *cur_op) {
100 std::vector<NPUOp *> in_ops;
101 for (auto in_tensor : cur_op->inputs()) {
102 for (auto op : npu_ops_) {
103 if (find(op->outputs().begin(), op->outputs().end(), in_tensor) != op->outputs().end()) {
104 in_ops.emplace_back(op);
105 }
106 }
107 }
108 return in_ops;
109 }
110
FindNextOps(NPUOp * cur_op)111 std::vector<NPUOp *> NPUGraph::FindNextOps(NPUOp *cur_op) {
112 std::vector<NPUOp *> out_ops;
113 for (auto out_tensor : cur_op->outputs()) {
114 for (auto op : npu_ops_) {
115 if (find(op->inputs().begin(), op->inputs().end(), out_tensor) != op->inputs().end()) {
116 out_ops.emplace_back(op);
117 }
118 }
119 }
120 return out_ops;
121 }
122
FindPreNextOps()123 int NPUGraph::FindPreNextOps() {
124 for (auto op : npu_ops_) {
125 auto in_ops = FindPreOps(op);
126 op->set_in_ops(in_ops);
127 auto out_ops = FindNextOps(op);
128 op->set_out_ops(out_ops);
129 }
130 return RET_OK;
131 }
132
FindValidSubgraphInOps(std::queue<NPUOp * > * valid_in_ops,std::queue<NPUOp * > * candidate_in_ops,std::map<const NPUOp *,bool> * is_visited)133 int NPUGraph::FindValidSubgraphInOps(std::queue<NPUOp *> *valid_in_ops, std::queue<NPUOp *> *candidate_in_ops,
134 std::map<const NPUOp *, bool> *is_visited) {
135 while (!candidate_in_ops->empty()) {
136 auto cur_op = candidate_in_ops->front();
137 candidate_in_ops->pop();
138 if ((*is_visited)[cur_op]) {
139 continue;
140 }
141 if (cur_op->type() == schema::PrimitiveType_Transpose) {
142 auto transpose_kernel = CreateNPUTransposeKernel(cur_op);
143 if (transpose_kernel == nullptr) {
144 MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
145 return RET_ERROR;
146 }
147 all_kernels_.emplace_back(transpose_kernel);
148 (*is_visited)[cur_op] = true;
149 for (auto out_op : cur_op->out_ops()) {
150 if (out_op->type() == schema::PrimitiveType_Transpose) {
151 candidate_in_ops->push(out_op);
152 } else {
153 auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
154 [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
155 if (input_ready) {
156 valid_in_ops->push(out_op);
157 }
158 }
159 }
160 } else {
161 valid_in_ops->push(cur_op);
162 }
163 }
164 return RET_OK;
165 }
166
FindReadySubgraphOps(std::queue<NPUOp * > op_queue,std::queue<NPUOp * > * next_candidate_ops,std::map<const NPUOp *,bool> * is_visited)167 std::vector<NPUOp *> NPUGraph::FindReadySubgraphOps(std::queue<NPUOp *> op_queue,
168 std::queue<NPUOp *> *next_candidate_ops,
169 std::map<const NPUOp *, bool> *is_visited) {
170 std::vector<NPUOp *> subgraph_ops;
171 while (!op_queue.empty()) {
172 auto cur_op = op_queue.front();
173 op_queue.pop();
174 if ((*is_visited)[cur_op]) {
175 continue;
176 }
177 subgraph_ops.emplace_back(cur_op);
178 (*is_visited)[cur_op] = true;
179 auto out_ops = cur_op->out_ops();
180 for (auto out_op : out_ops) {
181 if ((*is_visited)[out_op]) {
182 continue;
183 }
184 auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
185 [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
186 if (out_op->type() == schema::PrimitiveType_Transpose) {
187 next_candidate_ops->push(out_op);
188 } else if (input_ready) {
189 op_queue.push(out_op);
190 }
191 }
192 }
193 return subgraph_ops;
194 }
195
FindConnectedOps(NPUOp * head_op,std::vector<NPUOp * > ready_ops,std::vector<NPUOp * > * connected_ops,std::map<const NPUOp *,bool> * is_searched)196 void FindConnectedOps(NPUOp *head_op, std::vector<NPUOp *> ready_ops, std::vector<NPUOp *> *connected_ops,
197 std::map<const NPUOp *, bool> *is_searched) {
198 std::queue<NPUOp *> bfs_ops;
199 bfs_ops.push(head_op);
200 while (!bfs_ops.empty()) {
201 auto cur_op = bfs_ops.front();
202 bfs_ops.pop();
203 if ((*is_searched)[cur_op]) {
204 continue;
205 }
206 for (auto in_op : cur_op->in_ops()) {
207 if (std::find(ready_ops.begin(), ready_ops.end(), in_op) == ready_ops.end() || (*is_searched)[in_op]) {
208 continue;
209 }
210 bfs_ops.push(in_op);
211 }
212 for (auto out_op : cur_op->out_ops()) {
213 if (std::find(ready_ops.begin(), ready_ops.end(), out_op) == ready_ops.end() || (*is_searched)[out_op]) {
214 continue;
215 }
216 bfs_ops.push(out_op);
217 }
218 (*is_searched)[cur_op] = true;
219 connected_ops->emplace_back(cur_op);
220 }
221 return;
222 }
223
CreateSubgraphFromReadyOps(std::queue<NPUOp * > * valid_in_ops,std::vector<NPUOp * > ready_ops,std::map<const NPUOp *,bool> * is_searched)224 int NPUGraph::CreateSubgraphFromReadyOps(std::queue<NPUOp *> *valid_in_ops, std::vector<NPUOp *> ready_ops,
225 std::map<const NPUOp *, bool> *is_searched) {
226 while (!valid_in_ops->empty()) {
227 std::vector<NPUOp *> connected_ops;
228 auto op = valid_in_ops->front();
229 valid_in_ops->pop();
230 if ((*is_searched)[op]) {
231 continue;
232 }
233 if (!valid_in_ops->empty()) {
234 // use BFS to find out connected input ops
235 FindConnectedOps(op, ready_ops, &connected_ops, is_searched);
236 } else {
237 // if current input op is the only input op, there is no need to confirm the connectivity
238 for (auto ready_op : ready_ops) {
239 if (!(*is_searched)[ready_op]) {
240 connected_ops.emplace_back(ready_op);
241 (*is_searched)[ready_op] = true;
242 }
243 }
244 }
245 auto subgraph_kernel = CreateNPUSubgraphKernel(connected_ops);
246 if (subgraph_kernel == nullptr) {
247 MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
248 return RET_ERROR;
249 }
250 all_kernels_.emplace_back(subgraph_kernel);
251 }
252 return RET_OK;
253 }
254
CreateNPUSubgraphKernel(std::vector<NPUOp * > npu_ops)255 kernel::Kernel *NPUGraph::CreateNPUSubgraphKernel(std::vector<NPUOp *> npu_ops) {
256 auto subgraph = new (std::nothrow) NPUSubGraph(npu_ops, npu_manager_);
257 if (subgraph == nullptr) {
258 MS_LOG(ERROR) << "New NPU Subgraph failed.";
259 return nullptr;
260 }
261 subgraph->set_inputs(lite::GetGraphInTensors(npu_ops));
262 // The output of NPUGraph should be assigned to the corresponding NPUSubgraph
263 auto subgraph_outputs = lite::GetGraphOutTensors(npu_ops);
264 for (auto graph_output : this->outputs()) {
265 for (auto subgraph_op : npu_ops) {
266 auto subgraph_op_outputs = subgraph_op->outputs();
267 if (find(subgraph_op_outputs.begin(), subgraph_op_outputs.end(), graph_output) != subgraph_op_outputs.end() &&
268 find(subgraph_outputs.begin(), subgraph_outputs.end(), graph_output) == subgraph_outputs.end()) {
269 subgraph_outputs.emplace_back(graph_output);
270 break;
271 }
272 }
273 }
274 subgraph->set_outputs(subgraph_outputs);
275 auto ret = subgraph->Init();
276 if (ret != RET_OK) {
277 MS_LOG(ERROR) << "NPU Subgraph Init failed.";
278 return nullptr;
279 }
280 return subgraph;
281 }
282
// Wrap a transpose op into a standalone TransposeNPUKernel. Returns nullptr if
// op is not a transpose or allocation fails; the caller owns the result.
kernel::Kernel *NPUGraph::CreateNPUTransposeKernel(NPUOp *op) {
  if (op->type() != schema::PrimitiveType_Transpose) {
    MS_LOG(ERROR) << "Check npu transpose op failed.";
    return nullptr;
  }
  auto *transpose_op = static_cast<TransposeNPUOp *>(op);
  auto *kernel = new (std::nothrow) TransposeNPUKernel(transpose_op->inputs(), transpose_op->outputs(),
                                                       transpose_op->GetPerm(), transpose_op->name());
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "New npu transpose kernel failed.";
    return nullptr;
  }
  return kernel;
}
297
Prepare()298 int NPUGraph::Prepare() {
299 for (int i = 0; i < all_kernels_.size(); i++) {
300 auto ret = all_kernels_[i]->Prepare();
301 if (ret != RET_OK) {
302 MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " prepare failed.";
303 return RET_ERROR;
304 }
305 for (auto output : all_kernels_[i]->outputs()) {
306 if (find(outputs_.begin(), outputs_.end(), output) == outputs_.end()) {
307 if (output.MutableData() == nullptr) {
308 MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " prepare malloc output tensor failed.";
309 return RET_ERROR;
310 }
311 }
312 }
313 }
314 return RET_OK;
315 }
316
Execute()317 int NPUGraph::Execute() {
318 for (int i = 0; i < all_kernels_.size(); i++) {
319 // 1. malloc graph output data
320 for (auto output : all_kernels_[i]->outputs()) {
321 if (find(outputs_.begin(), outputs_.end(), output) != outputs_.end()) {
322 if (output.MutableData() == nullptr) {
323 MS_LOG(ERROR) << "NPU Subgraph " << output.Name() << " execute malloc output tensor failed.";
324 return RET_ERROR;
325 }
326 }
327 }
328 // 2. execute
329 auto ret = all_kernels_[i]->Execute();
330 if (ret != RET_OK) {
331 MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " execute failed.";
332 return RET_ERROR;
333 }
334 }
335 return RET_OK;
336 }
337 } // namespace mindspore
338