From 464f222815cd1fed680d91569b29da5d6b4c281c Mon Sep 17 00:00:00 2001
From: qinzheng4 <qinzheng4@huawei.com>
Date: Mon, 19 Feb 2024 15:22:17 +0800
Subject: [PATCH] 0009-npu-zero-copy

---
 include/c_api/tensor_c.h                      |  15 ++
 mindspore/lite/BUILD.gn                       |   1 +
 mindspore/lite/src/litert/c_api/model_c.cc    |  40 ++++-
 mindspore/lite/src/litert/c_api/tensor_c.cc   |  32 ++++
 .../lite/src/litert/c_api/type_c_private.h    |   3 +
 .../src/litert/cxx_api/model/model_impl.cc    |  72 +++++++-
 .../litert/delegate/nnrt/nnrt_allocator.cc    | 168 ++++++++++++++++++
 .../src/litert/delegate/nnrt/nnrt_allocator.h |  64 +++++++
 .../litert/delegate/nnrt/nnrt_model_kernel.cc |  50 +++++-
 .../litert/delegate/nnrt/nnrt_model_kernel.h  |   3 +
 .../litert/kernel/cpu/nnacl/nnacl_kernel.cc   |   2 +-
 mindspore/lite/src/litert/mindrt_executor.cc  |  14 +-
 12 files changed, 453 insertions(+), 11 deletions(-)
 create mode 100644 mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.cc
 create mode 100644 mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.h

diff --git a/include/c_api/tensor_c.h b/include/c_api/tensor_c.h
index 6d2aaab6..2f641725 100644
--- a/include/c_api/tensor_c.h
+++ b/include/c_api/tensor_c.h
@@ -154,6 +154,21 @@ OH_AI_API int64_t OH_AI_TensorGetElementNum(const OH_AI_TensorHandle tensor);
 /// \return The data size of the tensor.
 OH_AI_API size_t OH_AI_TensorGetDataSize(const OH_AI_TensorHandle tensor);
 
+/// \brief Obtain the allocator of the tensor.
+///
+/// \param[in] tensor Tensor object handle.
+///
+/// \return A pointer to the allocator.
+OH_AI_API void *OH_AI_TensorGetAllocator(OH_AI_TensorHandle tensor);
+
+/// \brief Set the allocator for the tensor.
+///
+/// \param[in] tensor Tensor object handle.
+/// \param[in] allocator A pointer to the allocator, previously obtained from OH_AI_TensorGetAllocator.
+///
+/// \return OH_AI_STATUS_SUCCESS on success, or a detailed error code on failure.
+OH_AI_API OH_AI_Status OH_AI_TensorSetAllocator(OH_AI_TensorHandle tensor, void *allocator);
+
 #ifdef __cplusplus
 }
 #endif
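[Editorial usage sketch, not part of the patch. It shows how the two functions declared above are meant to pair up: OH_AI_TensorGetAllocator registers the tensor's allocator with the framework, and OH_AI_TensorSetAllocator accepts only a pointer previously returned by OH_AI_TensorGetAllocator (see the tensor_c.cc hunk below). `model_input` and `user_tensor` are assumed to be valid handles obtained elsewhere, e.g. via OH_AI_ModelGetInputs and OH_AI_TensorCreate.]

    void *allocator = OH_AI_TensorGetAllocator(model_input);
    if (allocator != NULL) {
      /* Reuse the model input's allocator so the user tensor's buffer comes
       * from the same (possibly device-shared) memory pool. */
      OH_AI_Status ret = OH_AI_TensorSetAllocator(user_tensor, allocator);
      if (ret != OH_AI_STATUS_SUCCESS) {
        /* The pointer did not come from OH_AI_TensorGetAllocator. */
      }
    }
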
diff --git a/mindspore/lite/BUILD.gn b/mindspore/lite/BUILD.gn
index 4a83f498..723df1ec 100644
--- a/mindspore/lite/BUILD.gn
+++ b/mindspore/lite/BUILD.gn
@@ -443,6 +443,7 @@ ohos_shared_library("mindspore_lib") {
     "src/litert/delegate/nnrt/checker/primitive_check.cc",
     "src/litert/delegate/nnrt/nnrt_delegate.cc",
     "src/litert/delegate/nnrt/nnrt_model_kernel.cc",
+    "src/litert/delegate/nnrt/nnrt_allocator.cc",
   ]
   include_dirs += [
     "src/delegate/nnrt/include",
diff --git a/mindspore/lite/src/litert/c_api/model_c.cc b/mindspore/lite/src/litert/c_api/model_c.cc
index 9da52d76..20e1c227 100644
--- a/mindspore/lite/src/litert/c_api/model_c.cc
+++ b/mindspore/lite/src/litert/c_api/model_c.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include "include/c_api/model_c.h"
+#include "type_c_private.h"
 #include <vector>
 #include <cstdint>
 #include "include/api/context.h"
@@ -37,6 +38,11 @@ public:
     for (auto out : outputs_train_) {
       delete out;
     }
+
+    // In the zero-copy scene, OH_AI_TensorGetAllocator/OH_AI_TensorSetAllocator insert entries into a
+    // global allocator table. That table is never freed on its own, so it would keep growing and leak
+    // memory; clean it up whenever a ModelC is destroyed.
+    CleanAllocatorTable();
   }
 
   MSTensor **GetInputs(size_t *input_num);
@@ -246,10 +252,42 @@ OH_AI_Status OH_AI_ModelPredict(OH_AI_ModelHandle model, const OH_AI_TensorHandl
   mindspore::MSKernelCallBack after_call_back = impl->TransCallBack(after);
 
   std::vector<mindspore::MSTensor> ms_tensor_outputs;
+
+  bool all_has_data = false;
+
+  size_t output_num;
+  (void)impl->GetOutputs(&output_num);
+  auto handle_num = outputs->handle_num;
+  if (handle_num == output_num) {
+    MS_LOG(INFO) << "use user provided output";
+    for (size_t i = 0; i < output_num; i++) {
+      if (outputs->handle_list[i] == nullptr) {
+        MS_LOG(ERROR) << "user provided output array handle_list[" << i << "] is nullptr";
+        return OH_AI_STATUS_LITE_NULLPTR;
+      }
+      ms_tensor_outputs.push_back(*static_cast<mindspore::MSTensor *>(outputs->handle_list[i]));
+    }
+
+    all_has_data = std::all_of(ms_tensor_outputs.begin(), ms_tensor_outputs.end(), [](const mindspore::MSTensor &t) {
+      return t.Data() != nullptr;
+    });
+
+    if (!all_has_data) {
+      ms_tensor_outputs.clear();
+    }
+
+  }
+
   auto ret = impl->model_->Predict(ms_tensor_inputs, &ms_tensor_outputs, before_call_back, after_call_back);
   if (!ret.IsOk()) {
     MS_LOG(ERROR) << "Predict fail, ret :" << ret;
+    return static_cast<OH_AI_Status>(ret.StatusCode());
   }
+
+  if (handle_num == output_num && all_has_data) {
+    return OH_AI_STATUS_SUCCESS;
+  }
+
   outputs->handle_list = reinterpret_cast<OH_AI_TensorHandle *>(impl->GetOutputs(&outputs->handle_num));
   return static_cast<OH_AI_Status>(ret.StatusCode());
 }
@@ -345,7 +383,7 @@ char **OH_AI_TrainCfgGetLossName(OH_AI_TrainCfgHandle train_cfg, size_t *num) {
   auto impl = static_cast<mindspore::TrainCfg *>(train_cfg);
   auto loss_name = impl->GetLossName();
   *num = loss_name.size();
-  char **name = static_cast<char **>(malloc(loss_name.size()));
+  char **name = static_cast<char **>(malloc(loss_name.size() * sizeof(char *)));
   if (name == nullptr) {
     MS_LOG(ERROR) << "Failed to malloc loss_name.";
     return nullptr;
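[Editorial sketch, not part of the patch, of the zero-copy calling convention the OH_AI_ModelPredict hunk adds: when the caller passes an output array whose handle_num matches the model's output count and every handle already has data bound, results are written into those buffers and the handle list is left untouched. Assumes `model` is a valid, built OH_AI_ModelHandle; error handling is trimmed.]

    OH_AI_TensorHandleArray inputs = OH_AI_ModelGetInputs(model);
    /* ... fill input data ... */
    OH_AI_TensorHandleArray outputs = OH_AI_ModelGetOutputs(model);
    for (size_t i = 0; i < outputs.handle_num; i++) {
      /* Touching the data pointer allocates from each tensor's allocator,
       * so every output "has data" before Predict is called. */
      (void)OH_AI_TensorGetMutableData(outputs.handle_list[i]);
    }
    OH_AI_Status ret = OH_AI_ModelPredict(model, inputs, &outputs, NULL, NULL);
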
diff --git a/mindspore/lite/src/litert/c_api/tensor_c.cc b/mindspore/lite/src/litert/c_api/tensor_c.cc
index 4b1e6aff..fc3814dd 100644
--- a/mindspore/lite/src/litert/c_api/tensor_c.cc
+++ b/mindspore/lite/src/litert/c_api/tensor_c.cc
@@ -13,11 +13,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <unordered_map>
 #include "include/c_api/tensor_c.h"
 #include "include/api/status.h"
 #include "src/tensor.h"
 #include "src/litert/cxx_api/tensor/tensor_impl.h"
 
+static std::unordered_map<void *, std::weak_ptr<mindspore::Allocator>> allocator_table;
+
+void CleanAllocatorTable() {
+  allocator_table.clear();
+}
+
 OH_AI_TensorHandle OH_AI_TensorCreate(const char *name, OH_AI_DataType type, const int64_t *shape, size_t shape_num,
                                       const void *data, size_t data_len) {
   if (name == nullptr || shape == nullptr) {
@@ -208,3 +215,28 @@ size_t OH_AI_TensorGetDataSize(const OH_AI_TensorHandle tensor) {
   auto impl = static_cast<mindspore::MSTensor *>(tensor);
   return impl->DataSize();
 }
+
+OH_AI_Status OH_AI_TensorSetAllocator(OH_AI_TensorHandle tensor, void *allocator) {
+  if (tensor == nullptr) {
+    MS_LOG(ERROR) << "param is nullptr.";
+    return OH_AI_STATUS_LITE_NULLPTR;
+  }
+  auto impl = static_cast<mindspore::MSTensor *>(tensor);
+  if (allocator_table.count(allocator) == 0) {
+    MS_LOG(ERROR) << "the input allocator does not belong to the framework";
+    return OH_AI_STATUS_LITE_PARAM_INVALID;
+  }
+  std::static_pointer_cast<mindspore::LiteTensorImpl>(impl->impl())->set_own_data(true);
+  impl->SetAllocator(allocator_table[allocator].lock());
+  return OH_AI_STATUS_SUCCESS;
+}
+
+void *OH_AI_TensorGetAllocator(const OH_AI_TensorHandle tensor) {
+  if (tensor == nullptr) {
+    MS_LOG(ERROR) << "param is nullptr.";
+    return nullptr;
+  }
+  auto impl = static_cast<mindspore::MSTensor *>(tensor);
+  allocator_table[impl->allocator().get()] = impl->allocator();
+  return impl->allocator().get();
+}
diff --git a/mindspore/lite/src/litert/c_api/type_c_private.h b/mindspore/lite/src/litert/c_api/type_c_private.h
index 2d3b3883..1a76820d 100644
--- a/mindspore/lite/src/litert/c_api/type_c_private.h
+++ b/mindspore/lite/src/litert/c_api/type_c_private.h
@@ -36,5 +36,8 @@ struct NNRTDeviceDesc {
 
 #ifdef __cplusplus
 }
+
+void CleanAllocatorTable();
+
 #endif
 #endif  // MINDSPORE_LITE_SRC_LITERT_C_API_TYPE_C_PRIVATE_H_
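[Editorial note: the table above deliberately stores std::weak_ptr values. Handing a raw allocator pointer across the C boundary must not extend the allocator's lifetime, so OH_AI_TensorSetAllocator re-locks the entry, which yields an empty shared_ptr if the allocator is already gone, and rejects pointers the framework never handed out. A self-contained C++ sketch of the same pattern, with hypothetical names (Registry, Resource):]

    #include <memory>
    #include <unordered_map>

    struct Resource {};

    class Registry {
     public:
      // Hand out a raw pointer while remembering it only weakly, so the
      // registry never keeps the resource alive by itself.
      void *Export(const std::shared_ptr<Resource> &res) {
        table_[res.get()] = res;
        return res.get();
      }
      // Accept only previously exported pointers; lock() yields nullptr if
      // the resource has already been destroyed.
      std::shared_ptr<Resource> Import(void *ptr) {
        auto it = table_.find(ptr);
        return it == table_.end() ? nullptr : it->second.lock();
      }

     private:
      std::unordered_map<void *, std::weak_ptr<Resource>> table_;
    };
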
diff --git a/mindspore/lite/src/litert/cxx_api/model/model_impl.cc b/mindspore/lite/src/litert/cxx_api/model/model_impl.cc
index 78b1ca67..5d1b78a2 100644
--- a/mindspore/lite/src/litert/cxx_api/model/model_impl.cc
+++ b/mindspore/lite/src/litert/cxx_api/model/model_impl.cc
@@ -463,7 +463,55 @@ Status ModelImpl::Predict(const std::vector<MSTensor> &inputs, std::vector<MSTen
         input->set_shape(truncate_shape);
 #endif
       }
-      input->set_data(user_input.MutableData());
+      if (user_input.allocator() == input->allocator()) {
+        input->set_data(user_input.MutableData());
+        input->set_own_data(false);
+      } else {
+        void *user_data = user_input.MutableData();
+        if (user_data == nullptr) {
+          MS_LOG(ERROR) << "user data is nullptr";
+          return kLiteNullptr;
+        }
+        void *input_data = input->MutableData();
+        if (input_data == nullptr) {
+          MS_LOG(ERROR) << "input data is nullptr";
+          return kLiteNullptr;
+        }
+        memcpy(input_data, user_data, input->Size());
+      }
+    }
+  }
+
+
+  auto ori_output_tensors = GetOutputs();
+  std::vector<bool> copy_output_data;
+  copy_output_data.resize(ori_output_tensors.size(), false);
+  if (outputs->empty()) {
+    MS_LOG(INFO) << "user provided output is empty";
+  } else if (outputs->size() != ori_output_tensors.size()) {
+    MS_LOG(ERROR) << "user provided output size is not equal to model's output size";
+    return kLiteError;
+  } else {
+    for (size_t i = 0; i < ori_output_tensors.size(); i++) {
+      auto ori_output = ori_output_tensors[i];
+      auto lite_impl = std::static_pointer_cast<LiteTensorImpl>(ori_output.impl());
+      MS_CHECK_TRUE_RET(lite_impl != nullptr, kLiteNullptr);
+      auto ori_out_tensor = static_cast<lite::Tensor *>(lite_impl->lite_tensor());
+      MS_CHECK_TRUE_RET(ori_out_tensor != nullptr, kLiteNullptr);
+
+      auto user_output = (*outputs)[i];
+      if (ori_output.impl() == user_output.impl()) {
+        continue;
+      }
+
+      auto user_out_data = user_output.MutableData();
+      MS_CHECK_TRUE_RET(user_out_data != nullptr, kLiteNullptr);
+      if (ori_out_tensor->allocator() == user_output.allocator()) {
+        MS_LOG(INFO) << "use user data";
+        ori_out_tensor->set_data(user_out_data);
+        ori_out_tensor->set_own_data(false);
+      } else {
+        copy_output_data[i] = true;
       }
     }
   }
@@ -474,6 +522,28 @@ Status ModelImpl::Predict(const std::vector<MSTensor> &inputs, std::vector<MSTen
     return ret;
   }
   MS_LOG(DEBUG) << "Run graph success.";
+
+  for (size_t i = 0; i < copy_output_data.size(); i++) {
+    if (!copy_output_data[i]) {
+      continue;
+    }
+    auto ori_output = ori_output_tensors[i];
+    auto ori_out_data = ori_output.MutableData();
+    MS_CHECK_TRUE_RET(ori_out_data != nullptr, kLiteNullptr);
+    auto user_output = (*outputs)[i];
+    MS_CHECK_TRUE_RET(user_output.MutableData() != nullptr, kLiteNullptr);
+    if (user_output.DataSize() >= ori_output.DataSize()) {
+      memcpy(user_output.MutableData(), ori_out_data, ori_output.DataSize());
+    } else {
+      MS_LOG(ERROR) << "user out data size is less than model's output data size";
+      return kLiteError;
+    }
+  }
+
+  if (outputs->size() == ori_output_tensors.size()) {
+    return kSuccess;
+  }
+
   auto res = GetOutputs();
   if (res.empty()) {
     MS_LOG(DEBUG) << "Empty outputs.";
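[Editorial note: at the C++ level, the contract added above is that a non-empty `outputs` vector passed to Predict must match the model's output count; a user tensor sharing the model tensor's impl or allocator is adopted as the output buffer, anything else is filled by the post-run copy-back loop, and a user buffer smaller than the produced output is an error. A minimal sketch, assuming a built mindspore::Model named `model`; not part of the patch:]

    std::vector<mindspore::MSTensor> inputs = model.GetInputs();
    // ... fill input data ...

    // Tensors from GetOutputs() share the model tensors' impl/allocator, so
    // Predict reuses their buffers outright; a caller-created tensor with a
    // large-enough buffer would be filled by the copy-back path instead.
    std::vector<mindspore::MSTensor> outputs = model.GetOutputs();
    mindspore::Status ret = model.Predict(inputs, &outputs);
    if (ret != mindspore::kSuccess) {
      // handle the error
    }
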
diff --git a/mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.cc b/mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.cc
new file mode 100644
index 00000000..f79c1682
--- /dev/null
+++ b/mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.cc
@@ -0,0 +1,168 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <memory>
+#include <atomic>
+#include <unordered_map>
+#include <map>
+#include <mutex>
+#include "src/litert/delegate/nnrt/nnrt_allocator.h"
+#include "src/common/log.h"
+#include "interfaces/kits/c/neural_network_runtime/neural_network_runtime.h"
+
+namespace mindspore {
+namespace lite {
+NNRTAllocator::~NNRTAllocator() {
+  std::lock_guard<std::mutex> locker(mutex_);
+  for (auto &it : allocated_list_) {
+    auto membuf = it.second;
+    if (memory_category_ == NNRT_INPUT) {
+      OH_NNExecutor_DestroyInputMemory(executor_, index_, &(membuf->memory_));
+    } else {
+      OH_NNExecutor_DestroyOutputMemory(executor_, index_, &(membuf->memory_));
+    }
+    delete membuf;
+  }
+  allocated_list_.clear();
+
+  for (auto &it : free_list_) {
+    auto membuf = it.second;
+    if (memory_category_ == NNRT_INPUT) {
+      OH_NNExecutor_DestroyInputMemory(executor_, index_, &(membuf->memory_));
+    } else {
+      OH_NNExecutor_DestroyOutputMemory(executor_, index_, &(membuf->memory_));
+    }
+    delete membuf;
+  }
+  free_list_.clear();
+}
+
+void *NNRTAllocator::Malloc(size_t size) {
+  std::lock_guard<std::mutex> locker(mutex_);
+  auto iter = free_list_.lower_bound(size);
+  if (iter != free_list_.end()) {
+    auto membuf = iter->second;
+    membuf->ref_count_ = 0;
+    (void)free_list_.erase(iter);
+    allocated_list_[membuf->memory_->data] = membuf;
+    return membuf->memory_->data;
+  }
+
+  auto membuf = new (std::nothrow) MemBuf();
+  if (membuf == nullptr) {
+    MS_LOG(ERROR) << "new MemBuf failed.";
+    return nullptr;
+  }
+
+  membuf->ref_count_ = 0;
+  if (memory_category_ == NNRT_INPUT) {
+    membuf->memory_ = OH_NNExecutor_AllocateInputMemory(executor_, index_, size);
+  } else {
+    membuf->memory_ = OH_NNExecutor_AllocateOutputMemory(executor_, index_, size);
+  }
+
+  if (membuf->memory_ == nullptr) {
+    MS_LOG(ERROR) << "allocate OH_NN_Memory failed, return nullptr";
+    return nullptr;
+  }
+  if (membuf->memory_->data == nullptr) {
+    MS_LOG(ERROR) << "allocated OH_NN_Memory data is nullptr";
+    if (memory_category_ == NNRT_INPUT) {
+      OH_NNExecutor_DestroyInputMemory(executor_, index_, &(membuf->memory_));
+    } else {
+      OH_NNExecutor_DestroyOutputMemory(executor_, index_, &(membuf->memory_));
+    }
+    return nullptr;
+  }
+
+  allocated_list_[membuf->memory_->data] = membuf;
+  return membuf->memory_->data;
+}
+
+void NNRTAllocator::Free(void *ptr) {
+  if (ptr == nullptr) {
+    return;
+  }
+
+  std::lock_guard<std::mutex> locker(mutex_);
+  auto iter = allocated_list_.find(ptr);
+  if (iter == allocated_list_.end()) {
+    return;
+  }
+  auto membuf = iter->second;
+  membuf->ref_count_ = 0;
+  (void)allocated_list_.erase(iter);
+  (void)free_list_.insert(std::make_pair(membuf->memory_->length, membuf));
+}
+
+int NNRTAllocator::RefCount(void *ptr) {
+  if (ptr == nullptr) {
+    return -1;
+  }
+  std::lock_guard<std::mutex> locker(mutex_);
+  auto iter = allocated_list_.find(ptr);
+  if (iter != allocated_list_.end()) {
+    auto membuf = iter->second;
+    int ref_count = std::atomic_load(&membuf->ref_count_);
+    return ref_count;
+  }
+  return -1;
+}
+
+int NNRTAllocator::SetRefCount(void *ptr, int ref_count) {
+  if (ptr == nullptr) {
+    return -1;
+  }
+  std::lock_guard<std::mutex> locker(mutex_);
+  auto iter = allocated_list_.find(ptr);
+  if (iter != allocated_list_.end()) {
+    auto membuf = iter->second;
+    std::atomic_store(&membuf->ref_count_, ref_count);
+    return ref_count;
+  }
+  return -1;
+}
+
+int NNRTAllocator::DecRefCount(void *ptr, int ref_count) {
+  if (ptr == nullptr) {
+    return -1;
+  }
+  std::lock_guard<std::mutex> locker(mutex_);
+  auto iter = allocated_list_.find(ptr);
+  if (iter != allocated_list_.end()) {
+    auto membuf = iter->second;
+    auto ref = std::atomic_fetch_sub(&membuf->ref_count_, ref_count);
+    return ref;
+  }
+  return -1;
+}
+
+int NNRTAllocator::IncRefCount(void *ptr, int ref_count) {
+  if (ptr == nullptr) {
+    return -1;
+  }
+  std::lock_guard<std::mutex> locker(mutex_);
+  auto iter = allocated_list_.find(ptr);
+  if (iter != allocated_list_.end()) {
+    auto membuf = iter->second;
+    auto ref = std::atomic_fetch_add(&membuf->ref_count_, ref_count);
+    return ref;
+  }
+  return -1;
+}
+
+}  // namespace lite
+}  // namespace mindspore
\ No newline at end of file
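[Editorial sketch of the caching policy Malloc()/Free() implement above: freed device buffers are keyed by size in a multimap and recycled best-fit via lower_bound, and nothing goes back to the driver until the allocator itself is destroyed. Plain heap memory stands in for OH_NN_Memory here; not part of the patch:]

    #include <cstdlib>
    #include <map>

    class CachingPool {
     public:
      void *Alloc(std::size_t size) {
        // Smallest cached block of at least `size` bytes, as in NNRTAllocator::Malloc.
        auto it = free_list_.lower_bound(size);
        if (it != free_list_.end()) {
          void *p = it->second;
          allocated_[p] = it->first;  // remember the block's real capacity
          free_list_.erase(it);
          return p;
        }
        void *p = std::malloc(size);
        if (p != nullptr) {
          allocated_[p] = size;
        }
        return p;
      }
      void Free(void *p) {
        auto it = allocated_.find(p);
        if (it == allocated_.end()) {
          return;  // not ours: ignore, same policy as NNRTAllocator::Free
        }
        free_list_.emplace(it->second, p);  // recycle instead of releasing
        allocated_.erase(it);
      }

     private:
      std::map<void *, std::size_t> allocated_;
      std::multimap<std::size_t, void *> free_list_;
    };
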
diff --git a/mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.h b/mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.h
new file mode 100644
index 00000000..f6721369
--- /dev/null
+++ b/mindspore/lite/src/litert/delegate/nnrt/nnrt_allocator.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NNRT_NNRT_ALLOCATOR_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NNRT_NNRT_ALLOCATOR_H_
+
+#include <vector>
+#include <map>
+#include <atomic>
+#include <unordered_map>
+#include <cstddef>
+#include <mutex>
+#include "include/api/allocator.h"
+struct OH_NN_Memory;
+struct OH_NNExecutor;
+
+namespace mindspore {
+namespace lite {
+enum MemoryCategory { NNRT_INPUT, NNRT_OUTPUT };
+
+class NNRTAllocator : public Allocator {
+ public:
+  NNRTAllocator(OH_NNExecutor *executor, int index, MemoryCategory memory_category)
+      : index_(index), memory_category_(memory_category), executor_(executor) {}
+  ~NNRTAllocator() override;
+
+  void *Malloc(size_t size) override;
+  void Free(void *ptr) override;
+  int RefCount(void *ptr) override;
+  int SetRefCount(void *ptr, int ref_count) override;
+  int DecRefCount(void *ptr, int ref_count) override;
+  int IncRefCount(void *ptr, int ref_count) override;
+
+ private:
+  struct MemBuf {
+    std::atomic_int ref_count_{0};
+    OH_NN_Memory *memory_{nullptr};
+  };
+
+  int index_{0};
+  MemoryCategory memory_category_{NNRT_INPUT};
+  OH_NNExecutor *executor_{nullptr};
+  std::mutex mutex_;
+  // <membuf->memory_->data, membuf>
+  std::unordered_map<void *, MemBuf *> allocated_list_;
+  std::multimap<size_t, MemBuf *> free_list_;
+};
+
+}  // namespace lite
+}  // namespace mindspore
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_DELEGATE_NNRT_NNRT_ALLOCATOR_H_
\ No newline at end of file
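[Editorial usage sketch of the class above; the real wiring happens in NNRTModelKernel::Prepare in the next diff. `executor` is assumed to be a compiled OH_NNExecutor obtained from the NNRT delegate; not part of the patch:]

    auto allocator = std::make_shared<mindspore::lite::NNRTAllocator>(
        executor, /*index=*/0, mindspore::lite::NNRT_INPUT);
    void *buf = allocator->Malloc(4096);  // backed by OH_NNExecutor_AllocateInputMemory
    if (buf != nullptr) {
      allocator->Free(buf);  // recycled into the free list, not returned to NNRT
    }
    // Every cached block is handed back to NNRT when the allocator is destroyed.
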
diff --git a/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.cc b/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.cc
index 67443e08..f83632dd 100644
--- a/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.cc
+++ b/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.cc
@@ -15,8 +15,33 @@
  */
 #include <include/errorcode.h>
 #include "nnrt_model_kernel.h"
-int mindspore::NNRTModelKernel::Prepare() { return 0; }
+#include "nnrt_allocator.h"
+#include "litert/cxx_api/tensor/tensor_impl.h"
+int mindspore::NNRTModelKernel::Prepare() {
+  for (size_t i = 0; i < inputs_.size(); i++) {
+    auto nnrt_allocator = std::make_shared<lite::NNRTAllocator>(oh_nn_executor, i, lite::NNRT_INPUT);
+    if (nnrt_allocator == nullptr) {
+      MS_LOG(ERROR) << "Create NNRTAllocator failed";
+      return lite::RET_NULL_PTR;
+    }
+    inputs_[i].SetAllocator(nnrt_allocator);
+  }
+  for (size_t i = 0; i < outputs_.size(); i++) {
+    auto nnrt_allocator = std::make_shared<lite::NNRTAllocator>(oh_nn_executor, i, lite::NNRT_OUTPUT);
+    if (nnrt_allocator == nullptr) {
+      MS_LOG(ERROR) << "Create NNRTAllocator failed";
+      return lite::RET_NULL_PTR;
+    }
+    outputs_[i].SetAllocator(nnrt_allocator);
+  }
+  return lite::RET_OK;
+}
+
 int mindspore::NNRTModelKernel::Execute() {
+  MS_CHECK_TRUE_RET(!this->outputs().empty(), lite::RET_ERROR);
+  zero_copy_ = this->outputs()[Index0].allocator() != nullptr;
+
+
   lite::STATUS ret_val = PrepareInputs();
   if (ret_val != lite::RET_OK) {
     MS_LOG(ERROR) << "NNRTModelKernel PrepareInputs failed, STATUS is " << ret_val;
@@ -142,9 +167,17 @@ int mindspore::NNRTModelKernel::PrepareInputs() {
     oprend->dimensions = dimensions_list.data();
     oprend->quantParam = quant_param;
     oprend->type = OH_NN_TENSOR;
-    MS_LOG_INFO << "input tensor: " << tensor.Name() << ", data: " << (void *)tensor.MutableData() << ", size: " << tensor.DataSize();
-    OH_NN_ReturnCode ret_code =
-      OH_NNExecutor_SetInput(oh_nn_executor, i, oprend, tensor.MutableData(), tensor.DataSize());
+    MS_LOG_INFO << "input tensor: " << tensor.Name() << ", data: " << (void *)tensor.MutableData()
+                << ", size: " << tensor.DataSize();
+
+    OH_NN_ReturnCode ret_code;
+    if (zero_copy_) {
+      OH_NN_Memory mem{tensor.MutableData(), tensor.DataSize()};
+      ret_code = OH_NNExecutor_SetInputWithMemory(oh_nn_executor, i, oprend, &mem);
+    } else {
+      ret_code = OH_NNExecutor_SetInput(oh_nn_executor, i, oprend, tensor.MutableData(), tensor.DataSize());
+    }
+
     delete (oprend);
 
     if (!tmp_quant_param.empty()) {
@@ -165,7 +198,14 @@ int mindspore::NNRTModelKernel::TransferOutputs() {
   auto output_tensors = this->outputs();
   for (size_t i = 0; i < output_tensors.size(); i++) {
     auto tensor = output_tensors[i];
-    OH_NN_ReturnCode ret_code = OH_NNExecutor_SetOutput(oh_nn_executor, i, tensor.MutableData(), tensor.DataSize());
+
+    OH_NN_ReturnCode ret_code;
+    if (zero_copy_) {
+      OH_NN_Memory mem{tensor.MutableData(), tensor.DataSize()};
+      ret_code = OH_NNExecutor_SetOutputWithMemory(oh_nn_executor, i, &mem);
+    } else {
+      ret_code = OH_NNExecutor_SetOutput(oh_nn_executor, i, tensor.MutableData(), tensor.DataSize());
+    }
     if (ret_code != OH_NN_SUCCESS) {
       MS_LOG(ERROR) << "NNExecutor SetOutput failed, current out tensor is" << tensor.Name()
                     << ", OH_NN_ReturnCode = " << ret_code;
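[For reference, the NNRT executor protocol that Prepare/PrepareInputs/TransferOutputs implement between them is the memory-based I/O flow below. This is an editorial sketch of one inference, not part of the patch; `executor`, `input_desc` (an OH_NN_Tensor), `user_input`, and the sizes are assumed to exist, and error handling is trimmed:]

    OH_NN_Memory *in_mem = OH_NNExecutor_AllocateInputMemory(executor, 0, input_size);
    OH_NN_Memory *out_mem = OH_NNExecutor_AllocateOutputMemory(executor, 0, output_size);
    memcpy(in_mem->data, user_input, input_size);  // or produce the data in place: zero copy
    OH_NNExecutor_SetInputWithMemory(executor, 0, &input_desc, in_mem);
    OH_NNExecutor_SetOutputWithMemory(executor, 0, out_mem);
    OH_NNExecutor_Run(executor);
    // out_mem->data now holds the result. Destroy*Memory returns the buffers to
    // the driver, which is what ~NNRTAllocator() does for every cached block.
    OH_NNExecutor_DestroyInputMemory(executor, 0, &in_mem);
    OH_NNExecutor_DestroyOutputMemory(executor, 0, &out_mem);
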
data: " << (void *)tensor.MutableData() << ", size: " << tensor.DataSize(); 582- OH_NN_ReturnCode ret_code = 583- OH_NNExecutor_SetInput(oh_nn_executor, i, oprend, tensor.MutableData(), tensor.DataSize()); 584+ MS_LOG_INFO << "input tensor: " << tensor.Name() << ", data: " << (void *)tensor.MutableData() 585+ << ", size: " << tensor.DataSize(); 586+ 587+ OH_NN_ReturnCode ret_code; 588+ if (zero_copy_) { 589+ OH_NN_Memory mem{tensor.MutableData(), tensor.DataSize()}; 590+ ret_code = OH_NNExecutor_SetInputWithMemory(oh_nn_executor, i, oprend, &mem); 591+ } else { 592+ ret_code = OH_NNExecutor_SetInput(oh_nn_executor, i, oprend, tensor.MutableData(), tensor.DataSize()); 593+ } 594+ 595 delete (oprend); 596 597 if (!tmp_quant_param.empty()) { 598@@ -165,7 +198,14 @@ int mindspore::NNRTModelKernel::TransferOutputs() { 599 auto output_tensors = this->outputs(); 600 for (size_t i = 0; i < output_tensors.size(); i++) { 601 auto tensor = output_tensors[i]; 602- OH_NN_ReturnCode ret_code = OH_NNExecutor_SetOutput(oh_nn_executor, i, tensor.MutableData(), tensor.DataSize()); 603+ 604+ OH_NN_ReturnCode ret_code; 605+ if (zero_copy_) { 606+ OH_NN_Memory mem{tensor.MutableData(), tensor.DataSize()}; 607+ ret_code = OH_NNExecutor_SetOutputWithMemory(oh_nn_executor, i, &mem); 608+ } else { 609+ ret_code = OH_NNExecutor_SetOutput(oh_nn_executor, i, tensor.MutableData(), tensor.DataSize()); 610+ } 611 if (ret_code != OH_NN_SUCCESS) { 612 MS_LOG(ERROR) << "NNExecutor SetOutput failed, current out tensor is" << tensor.Name() 613 << ", OH_NN_ReturnCode = " << ret_code; 614diff --git a/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.h b/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.h 615index ea15f7ca..4f2d4f19 100644 616--- a/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.h 617+++ b/mindspore/lite/src/litert/delegate/nnrt/nnrt_model_kernel.h 618@@ -51,6 +51,9 @@ class NNRTModelKernel : public kernel::Kernel { 619 620 protected: 621 OH_NNExecutor *oh_nn_executor = nullptr; 622+ 623+ private: 624+ bool zero_copy_{false}; 625 }; 626 } // namespace mindspore 627 628diff --git a/mindspore/lite/src/litert/kernel/cpu/nnacl/nnacl_kernel.cc b/mindspore/lite/src/litert/kernel/cpu/nnacl/nnacl_kernel.cc 629index 813a6467..6cedc8c9 100644 630--- a/mindspore/lite/src/litert/kernel/cpu/nnacl/nnacl_kernel.cc 631+++ b/mindspore/lite/src/litert/kernel/cpu/nnacl/nnacl_kernel.cc 632@@ -105,7 +105,7 @@ int NNACLKernel::OptimizeDataCopy() { 633 634 if (input_tensor->allocator() == nullptr || input_tensor->allocator() != output_tensor->allocator() || 635 input_tensor->allocator() != ms_context_->allocator || /* runtime allocator */ 636- op_parameter_->is_train_session_) { 637+ op_parameter_->is_train_session_ || !output_tensor->own_data()) { 638 return NNACLKernel::Run(); 639 } 640 641diff --git a/mindspore/lite/src/litert/mindrt_executor.cc b/mindspore/lite/src/litert/mindrt_executor.cc 642index e5cd720c..5c08cedf 100644 643--- a/mindspore/lite/src/litert/mindrt_executor.cc 644+++ b/mindspore/lite/src/litert/mindrt_executor.cc 645@@ -295,14 +295,22 @@ void MindrtExecutor::FreeOutputTensor() { 646 if (dst_tensor->data_type() == kNumberTypeGLUInt && src_tensor->data_type() == kNumberTypeGLUInt) { 647 continue; 648 } 649- if (dst_tensor->allocator() != nullptr) { 650+ 651+ if ((dst_tensor->allocator() != nullptr && dst_tensor->own_data()) || dst_tensor->data() == nullptr) { 652+ MS_LOG(DEBUG) << "free data"; 653 dst_tensor->FreeData(); 654- } else { 655- if (dst_tensor->data_type() == 
diff --git a/mindspore/lite/src/litert/mindrt_executor.cc b/mindspore/lite/src/litert/mindrt_executor.cc
index e5cd720c..5c08cedf 100644
--- a/mindspore/lite/src/litert/mindrt_executor.cc
+++ b/mindspore/lite/src/litert/mindrt_executor.cc
@@ -295,14 +295,22 @@ void MindrtExecutor::FreeOutputTensor() {
     if (dst_tensor->data_type() == kNumberTypeGLUInt && src_tensor->data_type() == kNumberTypeGLUInt) {
       continue;
     }
-    if (dst_tensor->allocator() != nullptr) {
+
+    if ((dst_tensor->allocator() != nullptr && dst_tensor->own_data()) || dst_tensor->data() == nullptr) {
+      MS_LOG(DEBUG) << "free data";
       dst_tensor->FreeData();
-    } else {
-      if (dst_tensor->data_type() == src_tensor->data_type()) {
+    } else if (dst_tensor->data() != nullptr && dst_tensor->data_type() == src_tensor->data_type()) {
+      if (dst_tensor->allocator() == nullptr) {
         /* user set graph-output-tensor from outside */
+        MS_LOG(DEBUG) << "user set graph-output-tensor from outside";
         src_tensor->set_data(dst_tensor->data());
         src_tensor->set_own_data(false);
         src_tensor->set_allocator(nullptr);
+      } else if (dst_tensor->allocator() == src_tensor->allocator()) {
+        /* nnrt npu zero copy scene */
+        MS_LOG(DEBUG) << "zero copy data";
+        src_tensor->set_data(dst_tensor->data());
+        src_tensor->set_own_data(dst_tensor->own_data());
       }
     }
   }
-- 
2.25.1