• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
18 #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
19 
20 #include <vector>
21 #include <memory>
22 #include <string>
23 #include <set>
24 #include <mutex>
25 #include "utils/hash_map.h"
26 #include "runtime/graph_scheduler/actor/actor_common.h"
27 #include "runtime/graph_scheduler/device_tensor_store.h"
28 #include "runtime/hardware/device_context.h"
29 
30 namespace mindspore {
31 namespace runtime {
32 using mindspore::device::DeviceContext;
33 using mindspore::session::SomasInfo;
34 
// MemoryManagerActor needs to respond to memory alloc and free requests quickly, so it must be bound to a single thread.
36 class MemoryManagerActor : public ActorBase {
37  public:
GetInstance()38   static std::shared_ptr<MemoryManagerActor> &GetInstance() {
39     static std::shared_ptr<MemoryManagerActor> instance = std::shared_ptr<MemoryManagerActor>(new MemoryManagerActor());
40     return instance;
41   }
42   ~MemoryManagerActor() override = default;
43 
44   // The process entry of memory alloc.
45   void AllocateMemory(const std::vector<DeviceTensor *> *alloc_list, const DeviceContext *device_context,
46                       OpContext<DeviceTensor> *const op_context, const AID &from_aid);
47   // The process entry of continuous memory alloc, the size of alloc_list_list, size_list_list, total_size_list and
48   // device_contexts must be equal.
49   void AllocateContinuousMemory(const std::vector<std::vector<DeviceTensorPtr>> *alloc_list_list,
50                                 const std::vector<std::vector<size_t>> *size_list_list,
51                                 const std::vector<uint32_t> *stream_id_list, const std::vector<size_t> *total_size_list,
52                                 const std::vector<const DeviceContext *> *device_contexts,
53                                 OpContext<DeviceTensor> *const op_context, const AID &from_aid);
54   // device_contexts is from different device, the size of device_contexts must be equal to the alloc_list.
55   void AllocateBatchMemory(const std::vector<DeviceTensor *> *alloc_list,
56                            const std::vector<const DeviceContext *> *device_contexts,
57                            OpContext<DeviceTensor> *const op_context, const AID &from_aid);
58   // The process entry of somas memory alloc.
59   void AllocateSomasMemory(SomasInfo *const somas_info, const DeviceContext *device_context,
60                            OpContext<DeviceTensor> *const op_context, const AID &from_aid);
61 
62   // The process entry of memory free.
63   void FreeMemory(const std::vector<DeviceTensor *> *free_list, const DeviceContext *device_context,
64                   OpContext<DeviceTensor> *const op_context, const AID &from_aid);
65   // device_contexts is from different device, the size of device_contexts must be equal to the free_list.
66   void FreeBatchMemory(const std::vector<DeviceTensor *> *free_list,
67                        const std::vector<const DeviceContext *> *device_contexts,
68                        OpContext<DeviceTensor> *const op_context, const AID &from_aid);
69   // The process entry of somas memory free.
70   void FreeSomasMemory(SomasInfo *const somas_info, const DeviceContext *device_context,
71                        OpContext<DeviceTensor> *const op_context, const AID &from_aid);
72 
73   // Wait the MemoryManagerActor to finish running all current messages.
74   void Wait(OpContext<DeviceTensor> *const op_context, const AID &from_aid);
75 
76  private:
MemoryManagerActor()77   MemoryManagerActor() : ActorBase("MemoryManagerActor") {}
78   DISABLE_COPY_AND_ASSIGN(MemoryManagerActor);
79 
80   void FreeMemoryByRefCount(DeviceTensor *const device_tensor, const DeviceContext *device_context,
81                             const std::string &op_name);
82 
83   // When allocate device memory fail, print error log and set op context failed status.
84   void SetOpContextMemoryAllocFail(const std::string &kernel_name, const DeviceContext *device_context,
85                                    size_t alloc_size, OpContext<DeviceTensor> *const op_context);
86 
87   // MemoryManagerActor object is used like a single instance, if one actor allocates memory failed in one batch, which
88   // will set fail message info OpContext, major thread will destroy the OpContext object, subsequent actor can not set
89   // fail message again, so we record allocating memory fail event by the uuid of the batch, which is key of the set.
90   std::set<int> mem_alloc_failed_step_ids_;
91   std::mutex mem_alloc_failed_mutex_;
92 };
93 }  // namespace runtime
94 }  // namespace mindspore
95 
96 #endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
97