• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/delegate/npu/npu_graph.h"
18 #include <queue>
19 #include "src/delegate/npu/npu_subgraph.h"
20 #include "src/delegate/delegate_utils.h"
21 #include "src/delegate/npu/op/transpose_npu.h"
22 #include "src/delegate/npu/transpose_kernel.h"
23 namespace mindspore {
~NPUGraph()24 NPUGraph::~NPUGraph() {
25   for (auto *kernel : all_kernels_) {
26     delete kernel;
27   }
28   for (auto *op : npu_ops_) {
29     delete op;
30   }
31   for (auto tensor : insert_tensors_) {
32     MSTensor::DestroyTensorPtr(tensor);
33   }
34 }
35 
set_input(mindspore::MSTensor in_tensor,int index)36 void NPUGraph::set_input(mindspore::MSTensor in_tensor, int index) {
37   MS_ASSERT(index < inputs_.size());
38   auto origin_tensor = this->inputs_[index];
39   for (auto kernel : all_kernels_) {
40     for (size_t i = 0; i < kernel->inputs().size(); i++) {
41       if (kernel->inputs()[i] == origin_tensor) {
42         kernel->set_input(in_tensor, i);
43       }
44     }
45   }
46   this->inputs_[index] = in_tensor;
47 }
48 
set_output(mindspore::MSTensor out_tensor,int index)49 void NPUGraph::set_output(mindspore::MSTensor out_tensor, int index) {
50   MS_ASSERT(index < outputs_.size());
51   auto origin_tensor = this->outputs_[index];
52   for (auto kernel : all_kernels_) {
53     for (size_t i = 0; i < kernel->outputs().size(); i++) {
54       if (kernel->outputs()[i] == origin_tensor) {
55         kernel->set_output(out_tensor, i);
56       }
57     }
58   }
59   this->outputs_[index] = out_tensor;
60 }
61 
Init()62 int NPUGraph::Init() {
63   all_kernels_.clear();
64   std::map<const NPUOp *, bool> is_visited;
65   std::map<const NPUOp *, bool> is_searched;
66   std::queue<NPUOp *> candidate_in_ops;
67   std::queue<NPUOp *> valid_in_ops;
68   // Initialization
69   for (auto op : npu_ops_) {
70     is_visited[op] = false;
71     is_searched[op] = false;
72     if (op->in_ops().empty()) {
73       candidate_in_ops.push(op);
74     }
75   }
76   while (!candidate_in_ops.empty()) {
77     // 1. Find out all input ops except transpose, and handle transpose ops independently.
78     auto ret = FindValidSubgraphInOps(&valid_in_ops, &candidate_in_ops, &is_visited);
79     if (ret != RET_OK) {
80       MS_LOG(DEBUG) << "Fail to find valid input ops or handle transpose ops.";
81       return RET_ERROR;
82     }
83     if (valid_in_ops.empty()) {
84       MS_LOG(INFO) << "Can not find input ops except transpose.";
85       break;
86     }
87     // 2. Find out all ready ops based on valid input ops, but these ops maybe not belong to the same subgraph.
88     auto ready_ops = FindReadySubgraphOps(valid_in_ops, &candidate_in_ops, &is_visited);
89     // 3. Create subgraph(s). Input ops with connection will be built into a same subgraph.
90     ret = CreateSubgraphFromReadyOps(&valid_in_ops, ready_ops, &is_searched);
91     if (ret != RET_OK) {
92       MS_LOG(DEBUG) << "Fail to create subgraph(s) from ready ops.";
93       return RET_ERROR;
94     }
95   }
96   return RET_OK;
97 }
98 
FindPreOps(NPUOp * cur_op)99 std::vector<NPUOp *> NPUGraph::FindPreOps(NPUOp *cur_op) {
100   std::vector<NPUOp *> in_ops;
101   for (auto in_tensor : cur_op->inputs()) {
102     for (auto op : npu_ops_) {
103       if (find(op->outputs().begin(), op->outputs().end(), in_tensor) != op->outputs().end()) {
104         in_ops.emplace_back(op);
105       }
106     }
107   }
108   return in_ops;
109 }
110 
FindNextOps(NPUOp * cur_op)111 std::vector<NPUOp *> NPUGraph::FindNextOps(NPUOp *cur_op) {
112   std::vector<NPUOp *> out_ops;
113   for (auto out_tensor : cur_op->outputs()) {
114     for (auto op : npu_ops_) {
115       if (find(op->inputs().begin(), op->inputs().end(), out_tensor) != op->inputs().end()) {
116         out_ops.emplace_back(op);
117       }
118     }
119   }
120   return out_ops;
121 }
122 
FindPreNextOps()123 int NPUGraph::FindPreNextOps() {
124   for (auto op : npu_ops_) {
125     auto in_ops = FindPreOps(op);
126     op->set_in_ops(in_ops);
127     auto out_ops = FindNextOps(op);
128     op->set_out_ops(out_ops);
129   }
130   return RET_OK;
131 }
132 
FindValidSubgraphInOps(std::queue<NPUOp * > * valid_in_ops,std::queue<NPUOp * > * candidate_in_ops,std::map<const NPUOp *,bool> * is_visited)133 int NPUGraph::FindValidSubgraphInOps(std::queue<NPUOp *> *valid_in_ops, std::queue<NPUOp *> *candidate_in_ops,
134                                      std::map<const NPUOp *, bool> *is_visited) {
135   while (!candidate_in_ops->empty()) {
136     auto cur_op = candidate_in_ops->front();
137     candidate_in_ops->pop();
138     if ((*is_visited)[cur_op]) {
139       continue;
140     }
141     if (cur_op->type() == schema::PrimitiveType_Transpose) {
142       auto transpose_kernel = CreateNPUTransposeKernel(cur_op);
143       if (transpose_kernel == nullptr) {
144         MS_LOG(DEBUG) << "New NPU transpose kernel failed.";
145         return RET_ERROR;
146       }
147       all_kernels_.emplace_back(transpose_kernel);
148       (*is_visited)[cur_op] = true;
149       for (auto out_op : cur_op->out_ops()) {
150         if (out_op->type() == schema::PrimitiveType_Transpose) {
151           candidate_in_ops->push(out_op);
152         } else {
153           auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
154                                          [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
155           if (input_ready) {
156             valid_in_ops->push(out_op);
157           }
158         }
159       }
160     } else {
161       valid_in_ops->push(cur_op);
162     }
163   }
164   return RET_OK;
165 }
166 
FindReadySubgraphOps(std::queue<NPUOp * > op_queue,std::queue<NPUOp * > * next_candidate_ops,std::map<const NPUOp *,bool> * is_visited)167 std::vector<NPUOp *> NPUGraph::FindReadySubgraphOps(std::queue<NPUOp *> op_queue,
168                                                     std::queue<NPUOp *> *next_candidate_ops,
169                                                     std::map<const NPUOp *, bool> *is_visited) {
170   std::vector<NPUOp *> subgraph_ops;
171   while (!op_queue.empty()) {
172     auto cur_op = op_queue.front();
173     op_queue.pop();
174     if ((*is_visited)[cur_op]) {
175       continue;
176     }
177     subgraph_ops.emplace_back(cur_op);
178     (*is_visited)[cur_op] = true;
179     auto out_ops = cur_op->out_ops();
180     for (auto out_op : out_ops) {
181       if ((*is_visited)[out_op]) {
182         continue;
183       }
184       auto input_ready = std::all_of(out_op->in_ops().begin(), out_op->in_ops().end(),
185                                      [&](NPUOp *in_op) { return (*is_visited)[in_op] == true; });
186       if (out_op->type() == schema::PrimitiveType_Transpose) {
187         next_candidate_ops->push(out_op);
188       } else if (input_ready) {
189         op_queue.push(out_op);
190       }
191     }
192   }
193   return subgraph_ops;
194 }
195 
FindConnectedOps(NPUOp * head_op,std::vector<NPUOp * > ready_ops,std::vector<NPUOp * > * connected_ops,std::map<const NPUOp *,bool> * is_searched)196 void FindConnectedOps(NPUOp *head_op, std::vector<NPUOp *> ready_ops, std::vector<NPUOp *> *connected_ops,
197                       std::map<const NPUOp *, bool> *is_searched) {
198   std::queue<NPUOp *> bfs_ops;
199   bfs_ops.push(head_op);
200   while (!bfs_ops.empty()) {
201     auto cur_op = bfs_ops.front();
202     bfs_ops.pop();
203     if ((*is_searched)[cur_op]) {
204       continue;
205     }
206     for (auto in_op : cur_op->in_ops()) {
207       if (std::find(ready_ops.begin(), ready_ops.end(), in_op) == ready_ops.end() || (*is_searched)[in_op]) {
208         continue;
209       }
210       bfs_ops.push(in_op);
211     }
212     for (auto out_op : cur_op->out_ops()) {
213       if (std::find(ready_ops.begin(), ready_ops.end(), out_op) == ready_ops.end() || (*is_searched)[out_op]) {
214         continue;
215       }
216       bfs_ops.push(out_op);
217     }
218     (*is_searched)[cur_op] = true;
219     connected_ops->emplace_back(cur_op);
220   }
221   return;
222 }
223 
CreateSubgraphFromReadyOps(std::queue<NPUOp * > * valid_in_ops,std::vector<NPUOp * > ready_ops,std::map<const NPUOp *,bool> * is_searched)224 int NPUGraph::CreateSubgraphFromReadyOps(std::queue<NPUOp *> *valid_in_ops, std::vector<NPUOp *> ready_ops,
225                                          std::map<const NPUOp *, bool> *is_searched) {
226   while (!valid_in_ops->empty()) {
227     std::vector<NPUOp *> connected_ops;
228     auto op = valid_in_ops->front();
229     valid_in_ops->pop();
230     if ((*is_searched)[op]) {
231       continue;
232     }
233     if (!valid_in_ops->empty()) {
234       // use BFS to find out connected input ops
235       FindConnectedOps(op, ready_ops, &connected_ops, is_searched);
236     } else {
237       // if current input op is the only input op, there is no need to confirm the connectivity
238       for (auto ready_op : ready_ops) {
239         if (!(*is_searched)[ready_op]) {
240           connected_ops.emplace_back(ready_op);
241           (*is_searched)[ready_op] = true;
242         }
243       }
244     }
245     auto subgraph_kernel = CreateNPUSubgraphKernel(connected_ops);
246     if (subgraph_kernel == nullptr) {
247       MS_LOG(DEBUG) << "Create NPU subgraph kernel failed.";
248       return RET_ERROR;
249     }
250     all_kernels_.emplace_back(subgraph_kernel);
251   }
252   return RET_OK;
253 }
254 
CreateNPUSubgraphKernel(std::vector<NPUOp * > npu_ops)255 kernel::Kernel *NPUGraph::CreateNPUSubgraphKernel(std::vector<NPUOp *> npu_ops) {
256   auto subgraph = new (std::nothrow) NPUSubGraph(npu_ops, npu_manager_);
257   if (subgraph == nullptr) {
258     MS_LOG(ERROR) << "New NPU Subgraph failed.";
259     return nullptr;
260   }
261   subgraph->set_inputs(lite::GetGraphInTensors(npu_ops));
262   // The output of NPUGraph should be assigned to the corresponding NPUSubgraph
263   auto subgraph_outputs = lite::GetGraphOutTensors(npu_ops);
264   for (auto graph_output : this->outputs()) {
265     for (auto subgraph_op : npu_ops) {
266       auto subgraph_op_outputs = subgraph_op->outputs();
267       if (find(subgraph_op_outputs.begin(), subgraph_op_outputs.end(), graph_output) != subgraph_op_outputs.end() &&
268           find(subgraph_outputs.begin(), subgraph_outputs.end(), graph_output) == subgraph_outputs.end()) {
269         subgraph_outputs.emplace_back(graph_output);
270         break;
271       }
272     }
273   }
274   subgraph->set_outputs(subgraph_outputs);
275   auto ret = subgraph->Init();
276   if (ret != RET_OK) {
277     MS_LOG(ERROR) << "NPU Subgraph Init failed.";
278     return nullptr;
279   }
280   return subgraph;
281 }
282 
CreateNPUTransposeKernel(NPUOp * op)283 kernel::Kernel *NPUGraph::CreateNPUTransposeKernel(NPUOp *op) {
284   if (op->type() != schema::PrimitiveType_Transpose) {
285     MS_LOG(ERROR) << "Check npu transpose op failed.";
286     return nullptr;
287   }
288   auto transpose_op = static_cast<TransposeNPUOp *>(op);
289   auto transpose_kernel = new (std::nothrow)
290     TransposeNPUKernel(transpose_op->inputs(), transpose_op->outputs(), transpose_op->GetPerm(), transpose_op->name());
291   if (transpose_kernel == nullptr) {
292     MS_LOG(ERROR) << "New npu transpose kernel failed.";
293     return nullptr;
294   }
295   return transpose_kernel;
296 }
297 
Prepare()298 int NPUGraph::Prepare() {
299   for (int i = 0; i < all_kernels_.size(); i++) {
300     auto ret = all_kernels_[i]->Prepare();
301     if (ret != RET_OK) {
302       MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " prepare failed.";
303       return RET_ERROR;
304     }
305     for (auto output : all_kernels_[i]->outputs()) {
306       if (find(outputs_.begin(), outputs_.end(), output) == outputs_.end()) {
307         if (output.MutableData() == nullptr) {
308           MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " prepare malloc output tensor failed.";
309           return RET_ERROR;
310         }
311       }
312     }
313   }
314   return RET_OK;
315 }
316 
Execute()317 int NPUGraph::Execute() {
318   for (int i = 0; i < all_kernels_.size(); i++) {
319     // 1. malloc graph output data
320     for (auto output : all_kernels_[i]->outputs()) {
321       if (find(outputs_.begin(), outputs_.end(), output) != outputs_.end()) {
322         if (output.MutableData() == nullptr) {
323           MS_LOG(ERROR) << "NPU Subgraph " << output.Name() << " execute malloc output tensor failed.";
324           return RET_ERROR;
325         }
326       }
327     }
328     // 2. execute
329     auto ret = all_kernels_[i]->Execute();
330     if (ret != RET_OK) {
331       MS_LOG(ERROR) << "NPU Subgraph " << all_kernels_[i]->name() << " execute failed.";
332       return RET_ERROR;
333     }
334   }
335   return RET_OK;
336 }
337 }  // namespace mindspore
338