/**
 * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "src/litert/mindrt_executor.h"
#include <algorithm>
#include <list>
#include <queue>
#include <memory>
#include "src/litert/lite_mindrt.h"
#include "include/errorcode.h"
#include "src/common/common.h"
#include "src/common/tensor_util.h"
#ifdef ENABLE_FP16
#include "nnacl/base/cast_base.h"
#endif
#include "nnacl/nnacl_common.h"
#include "src/litert/kernel_exec_util.h"

namespace mindspore::lite {
namespace {
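// Create a future for every graph-output promise, then asynchronously dispatch each graph-input
// OpData to its actor; the returned future completes once all output promises are fulfilled.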
template <typename T>
Future<std::list<int>> MindrtAsyncRun(const std::vector<OpDataPtr<T>> &input_data, OpContext<T> *context,
                                      const std::shared_ptr<ActorMgr> &actor_mgr) {
  std::list<Future<int>> futures;
  auto promises = *(context->results_);
  (void)std::transform(promises.begin(), promises.end(), std::back_inserter(futures),
                       [](const Promise<int> &promise) { return promise.GetFuture(); });
  Future<std::list<int>> collect = mindspore::Collect<int>(futures);

  for (auto data : input_data) {
    Async(data->op_id_, actor_mgr, &mindspore::OpActor<T>::RunOpData, data.get(), context);
  }

  return collect;
}
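// Build an OpContext with one promise per graph output, launch the asynchronous run and block
// until every promise is resolved; returns 0 on success and -1 on failure.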
template <typename T>
int MindrtRun(const std::vector<OpDataPtr<T>> &input_data, std::vector<OpDataPtr<T>> *output_data,
              const void *kernel_call_back_before, const void *kernel_call_back_after,
              const std::shared_ptr<ActorMgr> &actor_mgr) {
  OpContext<T> context;
  std::vector<Promise<int>> promises(output_data->size());
  context.sequential_num_ = RandInt::Instance().Get();
  context.results_ = &promises;
  context.output_data_ = output_data;
  context.kernel_call_back_before_ = kernel_call_back_before;
  context.kernel_call_back_after_ = kernel_call_back_after;

  auto collect = MindrtAsyncRun<T>(input_data, &context, actor_mgr);
  collect.Wait();
  if (!collect.IsOK()) {
    return -1;
  }

  return 0;
}
}  // namespace
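// For every kernel input that is (or maps to) a graph input, bind the actor AID, the matching
// graph-input tensor and the input index into an OpData entry.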
int MindrtExecutor::PrepareGraphInput(const std::vector<kernel::KernelExec *> &kernels,
                                      const std::vector<Tensor *> &inputs) {
  auto kernels_size = kernels.size();
  for (size_t j = 0; j < kernels_size; ++j) {
    auto in_tensor_size = kernels[j]->in_tensors().size();
    for (size_t k = 0; k < in_tensor_size; ++k) {
      auto tensor = kernels[j]->in_tensors()[k];
      if (!tensor->IsGraphInput()) {
        // extendrt may create isolate_input_map_ outside of the executor
        auto input = isolate_input_map_->find(tensor);
        if (input == isolate_input_map_->end() || !input->second->IsGraphInput()) {
          continue;
        }
        tensor = input->second;
      }
      size_t idx = std::find(inputs.begin(), inputs.end(), tensor) - inputs.begin();
      if (idx == inputs.size()) {
        MS_LOG(ERROR) << "The input is not found.";
        return RET_ERROR;
      }
      auto data = std::make_shared<OpData<Tensor>>(op_actors_[j]->GetAID(), inputs.at(idx), static_cast<int>(k));
      if (MS_UNLIKELY(data == nullptr)) {
        MS_LOG(ERROR) << "new opdata failed.";
        return RET_NULL_PTR;
      }
      (void)input_data_.emplace_back(data);
    }
  }
  return RET_OK;
}
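// For every graph output tensor, find the subgraph tensor it was isolated from and the kernel
// output slot that produces it, then record an OpData entry and the actor's result index.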
int MindrtExecutor::PrepareGraphOutput(const std::vector<kernel::KernelExec *> &kernels,
                                       const std::vector<Tensor *> &outputs) {
  auto outputs_size = outputs.size();
  for (size_t i = 0; i < outputs_size; ++i) {
    Tensor *graph_output_tensor = outputs[i];
    if (graph_output_tensor->IsGraphInput()) {
      continue;
    }
    auto current_output_map =
      std::find_if(isolate_output_map_->begin(), isolate_output_map_->end(), [&](const auto output_map_tensor) {
        if (graph_output_tensor == output_map_tensor.second) {
          return true;
        }
        return false;
      });
    MS_ASSERT(current_output_map != isolate_output_map_->end());
    Tensor *subgraph_output_tensor = current_output_map->first;
    auto kernels_size = kernels.size();
    for (size_t j = 0; j < kernels_size; ++j) {
      auto out_tensor_size = kernels[j]->out_tensors().size();
      for (size_t k = 0; k < out_tensor_size; ++k) {
        if (subgraph_output_tensor != kernels[j]->out_tensors()[k]) {
          continue;
        }
        auto data =
          std::make_shared<OpData<Tensor>>(op_actors_[j]->GetAID(), subgraph_output_tensor, static_cast<int>(k));
        if (MS_UNLIKELY(data == nullptr)) {
          MS_LOG(ERROR) << "new opdata failed.";
          return RET_NULL_PTR;
        }
        op_actors_[j]->AddResultIndex(output_data_.size(), k);
        (void)output_data_.emplace_back(data);
      }
    }
  }
  if (output_data_.empty()) {
    MS_LOG(ERROR) << "output_data_ can not be empty.";
    return RET_ERROR;
  }
  return RET_OK;
}
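// Forward the new input shapes to every actor so the graph inputs can be resized.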
int MindrtExecutor::Resize(const std::vector<mindspore::lite::Tensor *> &inputs,
                           const std::vector<std::vector<int>> &dims) {
  for (auto actor : op_actors_) {
    actor->ResizeGraphInput(inputs, dims);
  }
  return RET_OK;
}
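// If the isolate-input map was created outside the executor (extendrt), only register the link
// info and hand the map to the actors; otherwise let each actor isolate its own input data.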
int MindrtExecutor::PreInitActors() {
  // extendrt may create isolate_input_map_ outside of the executor
  if (!isolate_input_map_->empty()) {
    for (auto &iter : *isolate_input_map_) {
      ctx_->SetLinkInfo(iter.second, iter.first);
    }
    for (const auto &actor : op_actors_) {
      actor->set_isolate_input_map(isolate_input_map_);
    }
    return RET_OK;
  }
  for (const auto &actor : op_actors_) {
    int ret = actor->PreInit(&op_actors_, isolate_input_map_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "IsolateInputData failed, actor aid: " << actor->GetAID();
      return ret;
    }
  }
  return RET_OK;
}
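// Map every input tensor to the set of (actor AID, input index) pairs that consume it; used to
// compile the data arrows between actors.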
std::unordered_map<void *, std::set<std::pair<AID, size_t>>> MindrtExecutor::BuildReceiverMap() {
  std::unordered_map<void *, std::set<std::pair<AID, size_t>>> receivers_map{};

  for (auto op_actor : op_actors_) {
    auto input_tensors = op_actor->GetKernel()->in_tensors();
    for (size_t i = 0; i < input_tensors.size(); ++i) {
      auto key = input_tensors[i];
      auto pair = std::make_pair(op_actor->GetAID(), i);
      auto iter = receivers_map.find(key);
      if (iter != receivers_map.end()) {
        (void)iter->second.emplace(pair);
      } else {
        std::set<std::pair<AID, size_t>> tmp_set{pair};
        receivers_map[input_tensors[i]] = tmp_set;
      }
    }
  }
  return receivers_map;
}
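// Compile each actor's data arrows according to the receiver map.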
int MindrtExecutor::LinkActors() {
  auto receivers_map = BuildReceiverMap();
  for (auto &&op_actor : op_actors_) {
    auto ret = op_actor->CompileArrow(receivers_map);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "actor: " << op_actor->GetAID() << " compile arrow failed.";
      return ret;
    }
  }
  return RET_OK;
}
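// Run each actor's PostInit step after the arrows have been linked.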
int MindrtExecutor::PostInitActors() {
  for (auto &&actor : op_actors_) {
    auto ret = actor->PostInit();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "PostInit failed, actor aid: " << actor->GetAID();
      return ret;
    }
  }
  return RET_OK;
}
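// Create one actor per kernel, bind the graph inputs and outputs to OpData, then pre-init,
// link and post-init the actors.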
int MindrtExecutor::Prepare(const std::vector<kernel::KernelExec *> &kernels, const std::vector<Tensor *> &inputs,
                            const std::vector<Tensor *> &outputs, lite::InnerContext *ctx) {
  MS_ASSERT(ctx != nullptr);
  ctx_ = ctx;
  actor_mgr_ = std::make_shared<ActorMgr>();
  if (actor_mgr_ == nullptr) {
    MS_LOG(ERROR) << "make_shared ActorMgr failed!";
    return RET_ERROR;
  }

  op_actors_ = CreateOpActor(kernels, ctx, actor_mgr_);
  if (op_actors_.size() != kernels.size()) {
    MS_LOG(ERROR) << "CreateOpActor failed! actor num: " << op_actors_.size() << ", kernels num: " << kernels.size();
    return RET_ERROR;
  }

  auto ret = PrepareGraphInput(kernels, inputs);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PrepareGraphInput failed! ret: " << ret;
    return ret;
  }

  ret = PrepareGraphOutput(kernels, outputs);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PrepareGraphOutput failed! ret: " << ret;
    return ret;
  }

  ret = PreInitActors();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PreInitActors failed! ret: " << ret;
    return ret;
  }

  ret = LinkActors();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LinkActors failed! ret: " << ret;
    return ret;
  }

  ret = PostInitActors();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PostInitActors failed! ret: " << ret;
    return ret;
  }
  return RET_OK;
}
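// Hand the data of every isolated subgraph output over to the user-visible graph output tensor,
// converting fp16 results to fp32 when needed and skipping OpenGL texture outputs.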
int MindrtExecutor::TransferGraphOutput() {
  for (auto tensor_map : *isolate_output_map_) {
    auto dst_tensor = tensor_map.second;
    auto src_tensor = tensor_map.first;
    if (dst_tensor->data_type() == kNumberTypeGLUInt && src_tensor->data_type() == kNumberTypeGLUInt) {
      continue;
    }
    dst_tensor->set_shape(src_tensor->shape());
    /* dst tensor is freed in FreeOutputTensor */
    if (src_tensor->data_type() == kNumberTypeFloat16 && dst_tensor->data_type() == kNumberTypeFloat32) {
      auto ret = dst_tensor->MallocData();
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "MallocData failed";
        return ret;
      }
#ifdef ENABLE_FP16
      Fp16ToFloat32(reinterpret_cast<float16_t *>(src_tensor->MutableData()),
                    reinterpret_cast<float *>(dst_tensor->data()), dst_tensor->ElementsNum());
#else
      auto src_data = reinterpret_cast<const uint16_t *>(src_tensor->MutableData());
      auto dst_data = reinterpret_cast<float *>(dst_tensor->data());
      for (int i = 0; i < dst_tensor->ElementsNum(); i++) {
        dst_data[i] = ShortToFloat32(src_data[i]);
      }
#endif
    } else {
      if (dst_tensor->allocator() != src_tensor->allocator()) {
        dst_tensor->set_allocator(src_tensor->allocator());
      }
      if (src_tensor->allocator() != nullptr) {
        dst_tensor->set_data(src_tensor->data());
        dst_tensor->set_own_data(src_tensor->IsConst() ? false : src_tensor->own_data());
      } else {
        dst_tensor->set_data(src_tensor->data());
        src_tensor->set_data(nullptr);
      }
    }
    src_tensor->DecRefCount();
  }
  return RET_OK;
}
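// Release or rebind the graph output buffers before a run: free data the executor owns, or let
// the subgraph output tensor reuse a user-provided or zero-copy buffer directly.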
void MindrtExecutor::FreeOutputTensor() {
  for (auto &&tensor_map : *isolate_output_map_) {
    auto src_tensor = tensor_map.first;
    auto dst_tensor = tensor_map.second;
    if (dst_tensor->data_type() == kNumberTypeGLUInt && src_tensor->data_type() == kNumberTypeGLUInt) {
      continue;
    }

    if ((dst_tensor->allocator() != nullptr && dst_tensor->own_data()) || dst_tensor->data() == nullptr) {
      MS_LOG(DEBUG) << "free data";
      dst_tensor->FreeData();
    } else if (dst_tensor->data() != nullptr && dst_tensor->data_type() == src_tensor->data_type()) {
      if (dst_tensor->allocator() == nullptr) {
        /* user set graph-output-tensor from outside */
        MS_LOG(DEBUG) << "user set graph-output-tensor from outside";
        src_tensor->set_data(dst_tensor->data());
        src_tensor->set_own_data(false);
        src_tensor->set_allocator(nullptr);
      } else if (dst_tensor->allocator() == src_tensor->allocator()) {
        /* nnrt npu zero copy scene */
        MS_LOG(DEBUG) << "zero copy data";
        src_tensor->set_data(dst_tensor->data());
        src_tensor->set_own_data(dst_tensor->own_data());
      }
    }
  }
  return;
}
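// Execute the actor graph: free stale output buffers, run all actors through MindRT and then
// transfer the results into the graph output tensors.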
int MindrtExecutor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                        const std::vector<kernel::KernelExec *> &kernels, const KernelCallBack &before,
                        const KernelCallBack &after) {
  CHECK_NULL_RETURN(ctx_);
  auto thread_pool = ctx_->thread_pool_;
  CHECK_NULL_RETURN(thread_pool);
  if (ctx_->delegate == nullptr) {
    thread_pool->SetSpinCountMaxValue();
  }

  FreeOutputTensor();

  auto ret = MindrtRun<Tensor>(input_data_, &output_data_, &before, &after, actor_mgr_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "MindrtRun failed";
    return ret;
  }

  ret = TransferGraphOutput();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "TransferGraphOutput failed";
    return ret;
  }

  thread_pool->SetSpinCountMinValue();
  return RET_OK;
}
}  // namespace mindspore::lite