1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_ 18 #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_ 19 20 #include <vector> 21 #include <memory> 22 #include <string> 23 #include <set> 24 #include <mutex> 25 #include "utils/hash_map.h" 26 #include "runtime/graph_scheduler/actor/actor_common.h" 27 #include "runtime/graph_scheduler/device_tensor_store.h" 28 #include "runtime/hardware/device_context.h" 29 30 namespace mindspore { 31 namespace runtime { 32 using mindspore::device::DeviceContext; 33 using mindspore::session::SomasInfo; 34 35 // MemoryManagerActor need response to memory alloc and free quickly, so must bind single thread. 36 class MemoryManagerActor : public ActorBase { 37 public: GetInstance()38 static std::shared_ptr<MemoryManagerActor> &GetInstance() { 39 static std::shared_ptr<MemoryManagerActor> instance = std::shared_ptr<MemoryManagerActor>(new MemoryManagerActor()); 40 return instance; 41 } 42 ~MemoryManagerActor() override = default; 43 44 // The process entry of memory alloc. 45 void AllocateMemory(const std::vector<DeviceTensor *> *alloc_list, const DeviceContext *device_context, 46 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 47 // The process entry of continuous memory alloc, the size of alloc_list_list, size_list_list, total_size_list and 48 // device_contexts must be equal. 49 void AllocateContinuousMemory(const std::vector<std::vector<DeviceTensorPtr>> *alloc_list_list, 50 const std::vector<std::vector<size_t>> *size_list_list, 51 const std::vector<uint32_t> *stream_id_list, const std::vector<size_t> *total_size_list, 52 const std::vector<const DeviceContext *> *device_contexts, 53 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 54 // device_contexts is from different device, the size of device_contexts must be equal to the alloc_list. 55 void AllocateBatchMemory(const std::vector<DeviceTensor *> *alloc_list, 56 const std::vector<const DeviceContext *> *device_contexts, 57 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 58 // The process entry of somas memory alloc. 59 void AllocateSomasMemory(SomasInfo *const somas_info, const DeviceContext *device_context, 60 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 61 62 // The process entry of memory free. 63 void FreeMemory(const std::vector<DeviceTensor *> *free_list, const DeviceContext *device_context, 64 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 65 // device_contexts is from different device, the size of device_contexts must be equal to the free_list. 66 void FreeBatchMemory(const std::vector<DeviceTensor *> *free_list, 67 const std::vector<const DeviceContext *> *device_contexts, 68 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 69 // The process entry of somas memory free. 70 void FreeSomasMemory(SomasInfo *const somas_info, const DeviceContext *device_context, 71 OpContext<DeviceTensor> *const op_context, const AID &from_aid); 72 73 // Wait the MemoryManagerActor to finish running all current messages. 74 void Wait(OpContext<DeviceTensor> *const op_context, const AID &from_aid); 75 76 private: MemoryManagerActor()77 MemoryManagerActor() : ActorBase("MemoryManagerActor") {} 78 DISABLE_COPY_AND_ASSIGN(MemoryManagerActor); 79 80 void FreeMemoryByRefCount(DeviceTensor *const device_tensor, const DeviceContext *device_context, 81 const std::string &op_name); 82 83 // When allocate device memory fail, print error log and set op context failed status. 84 void SetOpContextMemoryAllocFail(const std::string &kernel_name, const DeviceContext *device_context, 85 size_t alloc_size, OpContext<DeviceTensor> *const op_context); 86 87 // MemoryManagerActor object is used like a single instance, if one actor allocates memory failed in one batch, which 88 // will set fail message info OpContext, major thread will destroy the OpContext object, subsequent actor can not set 89 // fail message again, so we record allocating memory fail event by the uuid of the batch, which is key of the set. 90 std::set<int> mem_alloc_failed_step_ids_; 91 std::mutex mem_alloc_failed_mutex_; 92 }; 93 } // namespace runtime 94 } // namespace mindspore 95 96 #endif // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_ 97