/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
18 #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
19 
20 #include <vector>
21 #include <memory>
22 #include <string>
23 #include <set>
24 #include <unordered_map>
25 #include "runtime/framework/actor/actor_common.h"
26 #include "runtime/framework/device_tensor_store.h"
27 #include "runtime/hardware/device_context.h"
28 
29 namespace mindspore {
30 namespace runtime {
31 using mindspore::device::DeviceContext;
32 
33 // MemoryManagerActor need response to memory alloc and free quickly, so must bind single thread.
34 class MemoryManagerActor : public ActorBase {
35  public:
MemoryManagerActor()36   MemoryManagerActor() : ActorBase("MemoryManagerActor") {}
37   ~MemoryManagerActor() override = default;
38 
39   // The process entry of memory alloc.
40   void AllocateMemory(const std::vector<DeviceTensor *> *alloc_list, const DeviceContext *device_context,
41                       OpContext<DeviceTensor> *const op_context, const AID &from_aid);
42   // The process entry of continuous memory alloc, the size of alloc_list_list, size_list_list, total_size_list and
43   // device_contexts must be equal.
44   void AllocateContinuousMemory(const std::vector<std::vector<DeviceTensorPtr>> *alloc_list_list,
45                                 const std::vector<std::vector<size_t>> *size_list_list,
46                                 const std::vector<size_t> *total_size_list,
47                                 const std::vector<const DeviceContext *> *device_contexts,
48                                 OpContext<DeviceTensor> *const op_context, const AID &from_aid);
49   // device_contexts is from different device, the size of device_contexts must be equal to the alloc_list.
50   void AllocateBatchMemory(const std::vector<DeviceTensor *> *alloc_list,
51                            const std::vector<const DeviceContext *> *device_contexts,
52                            OpContext<DeviceTensor> *const op_context, const AID &from_aid);
53 
54   // The process entry of memory free.
55   void FreeMemory(const std::vector<DeviceTensor *> *free_list, const DeviceContext *device_context,
56                   OpContext<DeviceTensor> *const op_context);
57   // device_contexts is from different device, the size of device_contexts must be equal to the free_list.
58   void FreeBatchMemory(const std::vector<DeviceTensor *> *free_list,
59                        const std::vector<const DeviceContext *> *device_contexts,
60                        OpContext<DeviceTensor> *const op_context);
61 
62   // Wait the MemoryManagerActor to finish running all current messages.
63   void Wait(OpContext<DeviceTensor> *const op_context, const AID &from_aid);
64 
65  private:
66   // When allocate device memory fail, print error log and set op context failed status.
67   void SetOpContextMemoryAllocFail(const std::string &kernel_name, const DeviceContext *device_context,
68                                    size_t alloc_size, OpContext<DeviceTensor> *const op_context);
69 
70   // MemoryManagerActor object is used like a single instance, if one actor allocates memory failed in one batch, which
71   // will set fail message info OpContext, major thread will destroy the OpContext object, subsequent actor can not set
72   // fail message again, so we record allocating memory fail event by the uuid of the batch, which is key of the set.
73   std::set<int> mem_alloc_failed_step_ids_;
74 };
75 }  // namespace runtime
76 }  // namespace mindspore
77 
78 #endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
79