/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/framework/actor/memory_manager_actor.h"
#include "runtime/framework/actor/data_source_actor.h"
#include "runtime/framework/actor/kernel_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"

namespace mindspore {
namespace runtime {
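// Allocate device memory for every device tensor in alloc_list that does not yet hold a pointer,
// then notify the requesting actor via OnMemoryAllocFinish. On failure, record the error in
// op_context and return without the callback.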
void MemoryManagerActor::AllocateMemory(const std::vector<DeviceTensor *> *alloc_list,
                                        const DeviceContext *device_context, OpContext<DeviceTensor> *const op_context,
                                        const AID &from_aid) {
  MS_EXCEPTION_IF_NULL(alloc_list);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);

  for (auto &device_tensor : *alloc_list) {
    MS_EXCEPTION_IF_NULL(device_tensor);
    if (device_tensor->GetPtr() != nullptr) {
      continue;
    }
    // Allocate memory through the device context.
    if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
      SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context);
      return;
    }
  }

  // Call back to the from actor to continue processing after the memory allocation has finished.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

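// Allocate continuous memory blocks: each entry of alloc_list_list is a group of device tensors
// that must share one contiguous allocation of the corresponding total size, partitioned according
// to the matching entry of size_list_list.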
void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<DeviceTensorPtr>> *alloc_list_list,
                                                  const std::vector<std::vector<size_t>> *size_list_list,
                                                  const std::vector<size_t> *total_size_list,
                                                  const std::vector<const DeviceContext *> *device_contexts,
                                                  OpContext<DeviceTensor> *const op_context, const AID &from_aid) {
  MS_EXCEPTION_IF_NULL(alloc_list_list);
  MS_EXCEPTION_IF_NULL(size_list_list);
  MS_EXCEPTION_IF_NULL(total_size_list);
  MS_EXCEPTION_IF_NULL(device_contexts);
  MS_EXCEPTION_IF_NULL(op_context);
  if (((*alloc_list_list).size() != (*size_list_list).size()) ||
      ((*size_list_list).size() != (*total_size_list).size()) ||
      ((*total_size_list).size() != (*device_contexts).size())) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR(
      (*op_context), "The sizes of alloc_list_list, size_list_list, total_size_list and device_contexts are not equal.");
  }

  for (size_t i = 0; i < (*alloc_list_list).size(); ++i) {
    auto &alloc_list = (*alloc_list_list)[i];
    auto &size_list = (*size_list_list)[i];
    auto &total_size = (*total_size_list)[i];
    auto &device_context = (*device_contexts)[i];
    MS_EXCEPTION_IF_NULL(device_context);
    // Allocate memory through the device context.
    if (!device_context->AllocateContinuousMemory(alloc_list, total_size, size_list)) {
      SetOpContextMemoryAllocFail(from_aid.Name(), device_context, total_size, op_context);
      return;
    }
  }

  // Call back to the from actor to continue processing after the memory allocation has finished.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

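// Allocate memory for a batch of device tensors that may belong to different device contexts;
// alloc_list and device_contexts are paired element by element.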
void MemoryManagerActor::AllocateBatchMemory(const std::vector<DeviceTensor *> *alloc_list,
                                             const std::vector<const DeviceContext *> *device_contexts,
                                             OpContext<DeviceTensor> *const op_context, const AID &from_aid) {
  MS_EXCEPTION_IF_NULL(alloc_list);
  MS_EXCEPTION_IF_NULL(device_contexts);
  MS_EXCEPTION_IF_NULL(op_context);
  if ((*alloc_list).size() != (*device_contexts).size()) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context),
                                      "The size of alloc list is not equal to the size of device contexts.");
  }

  for (size_t i = 0; i < (*alloc_list).size(); ++i) {
    auto &device_tensor = (*alloc_list)[i];
    auto &device_context = (*device_contexts)[i];
    MS_EXCEPTION_IF_NULL(device_tensor);
    MS_EXCEPTION_IF_NULL(device_context);
    if (device_tensor->GetPtr() != nullptr) {
      continue;
    }

    // Allocate memory through the device context.
    if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
      SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context);
      return;
    }
  }

  // Call back to the from actor to continue processing after the memory allocation has finished.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

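// Decrease the reference counts of the device tensors in free_list and free the memory of those
// that reach zero. Tensors with an original ref count of SIZE_MAX are persistent and never freed.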
void MemoryManagerActor::FreeMemory(const std::vector<DeviceTensor *> *free_list, const DeviceContext *device_context,
                                    OpContext<DeviceTensor> *) {
  MS_EXCEPTION_IF_NULL(free_list);
  MS_EXCEPTION_IF_NULL(device_context);
  for (auto &device_tensor : *free_list) {
    MS_EXCEPTION_IF_NULL(device_tensor);
    if (device_tensor->original_ref_count() == SIZE_MAX) {
      continue;
    }
    // Decrease the reference count; when it reaches zero, free the memory and reset the count to its original value.
    device_tensor->DecreaseRefCount();
    if (device_tensor->ref_count() == 0) {
      // Free memory through the device context.
      if (device_tensor->GetPtr() != nullptr) {
        device_context->FreeMemory(device_tensor);
      }
      device_tensor->ResetRefCount();
    }
  }
}

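// Batch version of FreeMemory: each device tensor in free_list is freed through its paired
// device context.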
void MemoryManagerActor::FreeBatchMemory(const std::vector<DeviceTensor *> *free_list,
                                         const std::vector<const DeviceContext *> *device_contexts,
                                         OpContext<DeviceTensor> *const op_context) {
  MS_EXCEPTION_IF_NULL(free_list);
  MS_EXCEPTION_IF_NULL(device_contexts);
  MS_EXCEPTION_IF_NULL(op_context);
  if ((*free_list).size() != (*device_contexts).size()) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context),
                                      "The size of free list is not equal to the size of device contexts.");
  }

  for (size_t i = 0; i < (*free_list).size(); ++i) {
    auto &device_tensor = (*free_list)[i];
    auto &device_context = (*device_contexts)[i];
    MS_EXCEPTION_IF_NULL(device_tensor);
    MS_EXCEPTION_IF_NULL(device_context);
    if (device_tensor->original_ref_count() == SIZE_MAX) {
      continue;
    }
    // Decrease the reference count; when it reaches zero, free the memory and reset the count to its original value.
    device_tensor->DecreaseRefCount();
    if (device_tensor->ref_count() == 0) {
      // Free memory through the device context.
      if (device_tensor->GetPtr() != nullptr) {
        device_context->FreeMemory(device_tensor);
      }
      device_tensor->ResetRefCount();
    }
  }
}

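// No memory operation is needed here; immediately notify the from actor so it can continue running.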
void MemoryManagerActor::Wait(OpContext<DeviceTensor> *const op_context, const AID &from_aid) {
  // Call back to the from actor to continue processing.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

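// Record a memory allocation failure in op_context. Only the first failure of a step is recorded,
// to avoid repeated error handling within the same step.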
void MemoryManagerActor::SetOpContextMemoryAllocFail(const std::string &kernel_name,
                                                     const DeviceContext *device_context, size_t alloc_size,
                                                     OpContext<DeviceTensor> *const op_context) {
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);

  int step_id = op_context->sequential_num_;
  // Handle only the first memory allocation failure in this step.
  if (mem_alloc_failed_step_ids_.find(step_id) == mem_alloc_failed_step_ids_.end()) {
    mem_alloc_failed_step_ids_.clear();
    (void)mem_alloc_failed_step_ids_.insert(step_id);
    SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, *op_context, *device_context,
                                                kernel_name, alloc_size);
  }
}
}  // namespace runtime
}  // namespace mindspore