/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/framework/actor/memory_manager_actor.h"
#include "runtime/framework/actor/data_source_actor.h"
#include "runtime/framework/actor/kernel_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"

namespace mindspore {
namespace runtime {
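// Allocate device memory for every device tensor in alloc_list that does not yet hold a pointer. On the first
// allocation failure the failure is recorded in op_context and the function returns early; otherwise the requesting
// actor (from_aid) is notified asynchronously that allocation has finished.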
void MemoryManagerActor::AllocateMemory(const std::vector<DeviceTensor *> *alloc_list,
                                        const DeviceContext *device_context, OpContext<DeviceTensor> *const op_context,
                                        const AID &from_aid) {
  MS_EXCEPTION_IF_NULL(alloc_list);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);

  for (auto &device_tensor : *alloc_list) {
    MS_EXCEPTION_IF_NULL(device_tensor);
    if (device_tensor->GetPtr() != nullptr) {
      continue;
    }
    // Allocate memory through the device context.
    if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
      SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context);
      return;
    }
  }

  // Call back to the from actor for further processing after the memory allocation finishes.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

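// Allocate a continuous device memory block for each group of device tensors. The i-th entries of alloc_list_list,
// size_list_list, total_size_list and device_contexts together describe one allocation request, so the four lists
// must have equal sizes.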
void MemoryManagerActor::AllocateContinuousMemory(const std::vector<std::vector<DeviceTensorPtr>> *alloc_list_list,
                                                  const std::vector<std::vector<size_t>> *size_list_list,
                                                  const std::vector<size_t> *total_size_list,
                                                  const std::vector<const DeviceContext *> *device_contexts,
                                                  OpContext<DeviceTensor> *const op_context, const AID &from_aid) {
  MS_EXCEPTION_IF_NULL(alloc_list_list);
  MS_EXCEPTION_IF_NULL(size_list_list);
  MS_EXCEPTION_IF_NULL(total_size_list);
  MS_EXCEPTION_IF_NULL(device_contexts);
  MS_EXCEPTION_IF_NULL(op_context);
  if (((*alloc_list_list).size() != (*size_list_list).size()) ||
      ((*size_list_list).size() != (*total_size_list).size()) ||
      ((*total_size_list).size() != (*device_contexts).size())) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR(
      (*op_context), "The sizes of alloc_list_list, size_list_list, total_size_list and device_contexts are not equal.");
  }

  for (size_t i = 0; i < (*alloc_list_list).size(); ++i) {
    auto &alloc_list = (*alloc_list_list)[i];
    auto &size_list = (*size_list_list)[i];
    auto &total_size = (*total_size_list)[i];
    auto &device_context = (*device_contexts)[i];
    MS_EXCEPTION_IF_NULL(device_context);
    // Allocate memory through the device context.
    if (!device_context->AllocateContinuousMemory(alloc_list, total_size, size_list)) {
      SetOpContextMemoryAllocFail(from_aid.Name(), device_context, total_size, op_context);
      return;
    }
  }

  // Call back to the from actor for further processing after the memory allocation finishes.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

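// Allocate device memory for a batch of device tensors that may belong to different devices: the i-th tensor in
// alloc_list is allocated through the i-th device context in device_contexts.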
void MemoryManagerActor::AllocateBatchMemory(const std::vector<DeviceTensor *> *alloc_list,
                                             const std::vector<const DeviceContext *> *device_contexts,
                                             OpContext<DeviceTensor> *const op_context, const AID &from_aid) {
  MS_EXCEPTION_IF_NULL(alloc_list);
  MS_EXCEPTION_IF_NULL(device_contexts);
  MS_EXCEPTION_IF_NULL(op_context);
  if ((*alloc_list).size() != (*device_contexts).size()) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context),
                                      "The size of alloc list is not equal to the size of device contexts.");
  }

  for (size_t i = 0; i < (*alloc_list).size(); ++i) {
    auto &device_tensor = (*alloc_list)[i];
    auto &device_context = (*device_contexts)[i];
    MS_EXCEPTION_IF_NULL(device_tensor);
    MS_EXCEPTION_IF_NULL(device_context);
    if (device_tensor->GetPtr() != nullptr) {
      continue;
    }

    // Allocate memory through the device context.
    if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
      SetOpContextMemoryAllocFail(from_aid.Name(), device_context, device_tensor->GetSize(), op_context);
      return;
    }
  }

  // Call back to the from actor for further processing after the memory allocation finishes.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

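// Decrease the reference count of each device tensor in free_list; when a count reaches zero, the tensor's memory is
// freed through the device context and the count is reset. Tensors whose original reference count is SIZE_MAX are
// skipped and never freed here.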
void MemoryManagerActor::FreeMemory(const std::vector<DeviceTensor *> *free_list, const DeviceContext *device_context,
                                    OpContext<DeviceTensor> *) {
  MS_EXCEPTION_IF_NULL(free_list);
  MS_EXCEPTION_IF_NULL(device_context);
  for (auto &device_tensor : *free_list) {
    MS_EXCEPTION_IF_NULL(device_tensor);
    if (device_tensor->original_ref_count() == SIZE_MAX) {
      continue;
    }
    // Decrease the reference count; when it reaches zero, free the memory and reset the count to its original value.
    device_tensor->DecreaseRefCount();
    if (device_tensor->ref_count() == 0) {
      // Free memory through the device context.
      if (device_tensor->GetPtr() != nullptr) {
        device_context->FreeMemory(device_tensor);
      }
      device_tensor->ResetRefCount();
    }
  }
}

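// Batch variant of FreeMemory: the i-th tensor in free_list is freed through the i-th device context in
// device_contexts, so the two lists must have equal sizes.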
void MemoryManagerActor::FreeBatchMemory(const std::vector<DeviceTensor *> *free_list,
                                         const std::vector<const DeviceContext *> *device_contexts,
                                         OpContext<DeviceTensor> *const op_context) {
  MS_EXCEPTION_IF_NULL(free_list);
  MS_EXCEPTION_IF_NULL(device_contexts);
  MS_EXCEPTION_IF_NULL(op_context);
  if ((*free_list).size() != (*device_contexts).size()) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context),
                                      "The size of free list is not equal to the size of device contexts.");
  }

  for (size_t i = 0; i < (*free_list).size(); ++i) {
    auto &device_tensor = (*free_list)[i];
    auto &device_context = (*device_contexts)[i];
    MS_EXCEPTION_IF_NULL(device_tensor);
    MS_EXCEPTION_IF_NULL(device_context);
    if (device_tensor->original_ref_count() == SIZE_MAX) {
      continue;
    }
    // Decrease the reference count; when it reaches zero, free the memory and reset the count to its original value.
    device_tensor->DecreaseRefCount();
    if (device_tensor->ref_count() == 0) {
      // Free memory through the device context.
      if (device_tensor->GetPtr() != nullptr) {
        device_context->FreeMemory(device_tensor);
      }
      device_tensor->ResetRefCount();
    }
  }
}

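// No memory is allocated or freed here; the from actor is simply notified once this message is handled. Because this
// actor processes its messages sequentially, the notification is ordered after all earlier memory requests.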
void MemoryManagerActor::Wait(OpContext<DeviceTensor> *const op_context, const AID &from_aid) {
  // Call back to the from actor for further processing.
  Async(from_aid, &MemoryAwareActor::OnMemoryAllocFinish, op_context);
}

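// Record a memory allocation failure of kernel_name into op_context. Failed steps are tracked by sequential_num_, so
// only the first failure within a step is reported.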
void MemoryManagerActor::SetOpContextMemoryAllocFail(const std::string &kernel_name,
                                                     const DeviceContext *device_context, size_t alloc_size,
                                                     OpContext<DeviceTensor> *const op_context) {
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);

  int step_id = op_context->sequential_num_;
  // Handle only the first memory allocation failure within a step.
  if (mem_alloc_failed_step_ids_.find(step_id) == mem_alloc_failed_step_ids_.end()) {
    mem_alloc_failed_step_ids_.clear();
    (void)mem_alloc_failed_step_ids_.insert(step_id);
    SET_OPCONTEXT_MEMORY_ALLOC_FAIL_BY_STRATEGY(GraphExecutionStrategy::kPipeline, *op_context, *device_context,
                                                kernel_name, alloc_size);
  }
}
}  // namespace runtime
}  // namespace mindspore