/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_
#define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_

#include <vector>
#include <memory>
#include <string>
#include <set>
#include <unordered_map>
#include "runtime/framework/actor/actor_common.h"
#include "runtime/framework/device_tensor_store.h"
#include "runtime/hardware/device_context.h"

namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;

// The MemoryManagerActor must respond to memory alloc and free requests quickly, so it is bound to a single thread.
class MemoryManagerActor : public ActorBase {
 public:
  MemoryManagerActor() : ActorBase("MemoryManagerActor") {}
  ~MemoryManagerActor() override = default;

  // The entry point of memory allocation.
  void AllocateMemory(const std::vector<DeviceTensor *> *alloc_list, const DeviceContext *device_context,
                      OpContext<DeviceTensor> *const op_context, const AID &from_aid);
  // The entry point of continuous memory allocation. The sizes of alloc_list_list, size_list_list, total_size_list
  // and device_contexts must be equal.
  void AllocateContinuousMemory(const std::vector<std::vector<DeviceTensorPtr>> *alloc_list_list,
                                const std::vector<std::vector<size_t>> *size_list_list,
                                const std::vector<size_t> *total_size_list,
                                const std::vector<const DeviceContext *> *device_contexts,
                                OpContext<DeviceTensor> *const op_context, const AID &from_aid);
  // The device_contexts may come from different devices; the size of device_contexts must equal the size of
  // alloc_list.
  void AllocateBatchMemory(const std::vector<DeviceTensor *> *alloc_list,
                           const std::vector<const DeviceContext *> *device_contexts,
                           OpContext<DeviceTensor> *const op_context, const AID &from_aid);

  // The entry point of memory free.
  void FreeMemory(const std::vector<DeviceTensor *> *free_list, const DeviceContext *device_context,
                  OpContext<DeviceTensor> *const op_context);
  // The device_contexts may come from different devices; the size of device_contexts must equal the size of
  // free_list.
  void FreeBatchMemory(const std::vector<DeviceTensor *> *free_list,
                       const std::vector<const DeviceContext *> *device_contexts,
                       OpContext<DeviceTensor> *const op_context);

  // Wait for the MemoryManagerActor to finish processing all current messages.
  void Wait(OpContext<DeviceTensor> *const op_context, const AID &from_aid);
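
  // Illustrative usage sketch (not part of this interface): callers such as kernel actors normally post requests to
  // this actor asynchronously instead of invoking the entries above directly. Assuming the mindrt Async helper and
  // hypothetical caller-side members (memory_manager_aid_, memory_alloc_list_, memory_free_list_), an allocation
  // request and the matching free request could look like:
  //
  //   Async(memory_manager_aid_, &MemoryManagerActor::AllocateMemory,
  //         &memory_alloc_list_, device_context, op_context, GetAID());
  //   Async(memory_manager_aid_, &MemoryManagerActor::FreeMemory,
  //         &memory_free_list_, device_context, op_context);
  //
  // The from_aid of an allocation request identifies the requester to notify once the memory is ready.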

 private:
  // When device memory allocation fails, print an error log and set the failed status on the op context.
  void SetOpContextMemoryAllocFail(const std::string &kernel_name, const DeviceContext *device_context,
                                   size_t alloc_size, OpContext<DeviceTensor> *const op_context);

  // The MemoryManagerActor object is used like a singleton. If one actor in a batch fails to allocate memory, it sets
  // the failure message in the OpContext, and the main thread then destroys the OpContext object, so subsequent
  // actors cannot set the failure message again. Therefore the allocation-failure event is recorded by the uuid of
  // the batch (the step id), which is the key of this set.
  std::set<int> mem_alloc_failed_step_ids_;
};
}  // namespace runtime
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_MEMORY_MANAGER_ACTOR_H_