/**
 * Copyright 2019-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_DYNAMIC_ALLOCATOR_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_DYNAMIC_ALLOCATOR_H_

#include <algorithm>
#include <functional>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
#include <random>
#include <string>
#include <tuple>

#include "utils/ms_utils.h"
#include "include/backend/visible.h"
#include "include/common/utils/stream_util.h"
#include "ir/device_event.h"
#ifdef __APPLE__
#include "mindrt/include/async/spinlock.h"
#endif

namespace mindspore {
namespace device {
// The status of a memory buf.
enum class DynamicMemBufStatus : int { kMemBufIdle, kMemBufUsed, kMemBufEagerFree, kMemBufUsedByEvent };
// The memory allocator type, used to record memory classification statistics.
enum class AllocatorType : int { kWeight, kConstantValue, kKernelOutput, kGraphOutput, kWorkspace, kOther };
constexpr int kShiftOffset = 2;
constexpr int kAllocatorTypeNum = 6;
// Memory is allocated aligned to 512 bytes.
constexpr size_t kDynamicMemAlignSize = 512;
// The minimum unit size (1 GB) of a memory block used for dynamic extension.
constexpr size_t kDynamicMemAllocUnitSize = 1024 << 20;

// The comparator of device addresses, from small to large.
using DeviceMemPtr = void(*);
struct DeviceAddrCmp {
  bool operator()(const DeviceMemPtr &addr1, const DeviceMemPtr &addr2) const { return addr1 < addr2; }
};

// The AllocatorDebugInfo wrapper, which is thread local for the dynamic memory pool.
class DynamicMemAllocatorDebugInfo;
// A memory buf is the smallest operation object of the dynamic memory pool.
struct DynamicMemBuf;
using DynamicMemBufPtr = std::shared_ptr<DynamicMemBuf>;
// The multimap key is the tensor size, for finding an idle memory buf by tensor size.
using SizeMapMemBuf = std::multimap<size_t, DynamicMemBufPtr>;
// The map key is the device address, for finding a used memory buf in a memory block by device address.
using DeviceAddrMapMemBuf = std::map<DeviceMemPtr, DynamicMemBufPtr, DeviceAddrCmp>;
// A memory block is composed of memory bufs.
class DynamicMemBlock;
using DynamicMemBlockPtr = std::shared_ptr<DynamicMemBlock>;

struct MemStatusManager;
using MemStatusManagerPtr = std::shared_ptr<MemStatusManager>;
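
// A minimal illustrative sketch, assuming nothing beyond the constant above: how a
// requested size is rounded up to the 512-byte alignment. The helper name is
// hypothetical; the pool's real rounding lives in DynamicMemPoolBestFit::AlignMemorySize.
inline size_t ExampleAlignDynamicMemSize(size_t size) {
  // e.g. 1 -> 512, 512 -> 512, 513 -> 1024.
  return ((size + kDynamicMemAlignSize - 1) / kDynamicMemAlignSize) * kDynamicMemAlignSize;
}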

// std::pair has no default hash method, so one is provided for the maps keyed by pairs below.
struct pair_hash {
  template <class L, class R>
  std::size_t operator()(const std::pair<L, R> &param) const {
    size_t hash = std::hash<L>{}(param.first);
    hash <<= (sizeof(size_t) << kShiftOffset);
    hash ^= std::hash<R>{}(param.second);
    return std::hash<size_t>{}(hash);
  }
};
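
// Illustrative only: pair_hash lets std::unordered_map be keyed directly by a pair, as
// stream_pair_addresses_ and MemStatusManager::mem_bufs_ are further below. The alias
// name here is hypothetical.
using ExamplePairKeyedMap = std::unordered_map<std::pair<uint32_t, uint32_t>, size_t, pair_hash>;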

// The main class of the dynamic memory pool.
class BACKEND_EXPORT DynamicMemPoolBestFit {
 public:
  DynamicMemPoolBestFit()
      : persistent_mem_(std::make_shared<MemStatusManager>()), common_mem_(std::make_shared<MemStatusManager>()) {}
  virtual ~DynamicMemPoolBestFit();

  // The main entry point of memory allocation.
  DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false, bool need_recycle = false,
                              uint32_t stream_id = kDefaultStreamIndex);
  // The main entry point of continuous memory allocation.
  std::vector<DeviceMemPtr> AllocContinuousTensorMem(const std::vector<size_t> &size_list,
                                                     uint32_t stream_id = kDefaultStreamIndex);
  // The main entry point of memory free.
  void FreeTensorMem(const DeviceMemPtr &device_addr);
  // The main entry point for freeing some memory addresses while keeping others.
  void FreePartTensorMems(const std::vector<DeviceMemPtr> &free_addrs, const std::vector<DeviceMemPtr> &keep_addrs,
                          const std::vector<size_t> &keep_addr_sizes);

  // Release the real device memory.
  void ReleaseDeviceRes();

  // Get the minimum memory unit size used for dynamic extension.
  size_t MemAllocUnitSize(bool from_persistent_mem = false) const;
  // Set the minimum memory unit size used for dynamic extension.
  void SetMemAllocUintSize(size_t common_size, size_t persist_size = kDynamicMemAllocUnitSize);

  // Extract detailed block information.
  std::unordered_map<device::DeviceMemPtr, std::unordered_map<std::string, size_t>> ExtractBlocksListInfo(
    const MemStatusManagerPtr &mem_mng) const;

  // The statistics information.
  size_t TotalMemStatistics() const;
  size_t TotalUsedMemStatistics() const;
  size_t TotalUsedByEventMemStatistics() const;
  size_t TotalIdleMemStatistics() const;
  size_t TotalEagerFreeMemStatistics() const;
  size_t UsedMemPeakStatistics() const;
  size_t MaxMemAllocatedStatistics() const;
  size_t MaxMemReservedStatistics() const;
  size_t ActualPeakStatistics() const;
  std::unordered_map<std::string, std::size_t> BlockCountsStatistics() const;
  std::unordered_map<std::string, std::size_t> BlockUnitSizeStatistics() const;
  std::unordered_map<device::DeviceMemPtr, std::unordered_map<std::string, size_t>> CommonMemBlocksInfoStatistics()
    const;
  std::unordered_map<device::DeviceMemPtr, std::unordered_map<std::string, size_t>> PersistentMemBlocksInfoStatistics()
    const;
  void ResetMaxMemReserved() const;
  void ResetMaxMemAllocated() const;

  // Display the brief state information of memory blocks and memory bufs.
  void DumpDynamicMemPoolStateInfo();
  // Display the detailed debug information of memory blocks and memory bufs.
  void DumpDynamicMemPoolDebugInfo();

  void DefragMemory();

  // The interface of real device memory operations; must be overridden for each device type.
  virtual size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) = 0;
  virtual bool FreeDeviceMem(const DeviceMemPtr &addr) = 0;
  virtual size_t free_mem_size() = 0;
  virtual uint64_t total_mem_size() const { return 0; }
  // Set the mem pool block size.
  virtual void SetMemPoolBlockSize(size_t available_device_mem_size);
  virtual size_t GetMaxUsedMemSize() const { return 0; }

  // Element in vector: <memory_stream_id, address>.
  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
                   const std::vector<std::pair<uint32_t, DeviceMemPtr>> &memory_stream_addresses,
                   const DeviceEventPtr &event);
  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id);
  bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id);
  bool SyncAllEvents();
  virtual std::string GetMemoryPoolType() const { return "Other"; }
#ifdef WITH_BACKEND

 protected:
#endif
  const MemStatusManagerPtr &common_mem() const { return common_mem_; }
  const MemStatusManagerPtr &persistent_mem() const { return persistent_mem_; }
  void *GetMinUsingMemoryAddr() const;
  // The real size after allocation alignment.
  virtual size_t AlignMemorySize(size_t size) const;
  // Calculate the alloc size required when adding a memory block.
  virtual size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false);
  std::set<DeviceMemPtr> mem_bufs_;
  // The interface of device memory eager free.
  virtual const bool IsEnableEagerFree() const { return false; }
  const bool IsEnableVmm() const { return enable_vmm_; }
  void SetEnableVmm(bool enable_vmm) { enable_vmm_ = enable_vmm; }
  virtual const bool SyncAllStreams() { return false; }
  virtual size_t AllocDeviceMemByEagerFree(size_t size, DeviceMemPtr *addr) { return 0; }
  virtual size_t FreeDeviceMemByEagerFree(const DeviceMemPtr addr, const size_t size) { return 0; }
  virtual size_t MmapDeviceMem(size_t size, DeviceMemPtr addr) { return 0; }
  const size_t FreeIdleMemsByEagerFree();
#ifdef WITH_BACKEND

 private:
#endif
  // Find an available memory buf across pools by status, covering idle and eager free.
  DeviceMemPtr FindAvailableMemBuf(size_t size, bool from_persistent_mem, uint32_t stream_id);
  // Find a memory buf with the target status across pools by aligned size when allocating.
  DeviceMemPtr FindMemBufByStatus(size_t size, bool from_persistent_mem, DynamicMemBufStatus target_status,
                                  uint32_t stream_id);
  // Find a memory buf with the target status in a specific pool by aligned size when allocating.
  DeviceMemPtr FindMemBufInSpecifiedMng(size_t size, bool from_persistent_mem, DynamicMemBufStatus target_status,
                                        uint32_t stream_id);

  // Add a memory block and a memory buf.
  DeviceMemPtr AddMemBlockAndMemBuf(size_t size, bool from_persistent_mem, bool need_recycle, uint32_t stream_id);
  // Add a memory block and a memory buf with the eager free API.
  DeviceMemPtr AddMemBlockAndMemBufByEagerFree(size_t size, bool from_persistent_mem, uint32_t stream_id);
  // Add a memory block and a memory buf when allocation finds no available memory buf.
  DeviceMemPtr CreateMemBlockAndMemBuf(size_t size, bool from_persistent_mem, DeviceMemPtr source_addr,
                                       size_t source_size, DynamicMemBufStatus mem_buf_status, uint32_t stream_id);

  // Judge, from the alloc size and the memory buf size, whether the memory buf needs to be split.
  bool IsSplit(size_t tensor_size, size_t mem_buf_size) const;
  // Split the memory buf by alloc size.
  void SplitMemBuf(size_t size, const DynamicMemBufPtr &mem_buf, const MemStatusManagerPtr &mem_mng,
                   uint32_t stream_id);

  // Find the memory block by device address.
  DynamicMemBlockPtr FindMemBlock(const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mng) const;
  // The comparator of memory blocks by device address; memory blocks are kept ordered by device address.
  static bool CmpMemBlock(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block);

  // Free memory without locking; the caller needs to hold the lock.
  void FreeTensorMemInner(const DeviceMemPtr &device_addr);
  // Pre-combine a mem buf; returns false when the mem buf cannot be combined.
  bool PreCombineMemBuf(const DynamicMemBufPtr &mem_buf, const MemStatusManagerPtr &mem_mng);
  // Combine memory bufs on free, to avoid memory fragmentation.
  void CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceAddrMapMemBuf::iterator &iter,
                     const MemStatusManagerPtr &mem_mng, DynamicMemBufStatus origin_status,
                     DynamicMemBufStatus target_status);
  // Fetch the mem info by the exact address.
  std::tuple<DynamicMemBlockPtr, DeviceAddrMapMemBuf::iterator, MemStatusManagerPtr> FindByStrictAddr(
    const DeviceMemPtr &device_addr) const;

  // Keep part of the memory by address.
  void KeepTensorMemByAddr(const DeviceMemPtr &device_addr, size_t size);
  std::tuple<DynamicMemBlockPtr, DynamicMemBufPtr, MemStatusManagerPtr> FindByKeepAddr(
    const DeviceMemPtr &device_addr) const;
  DynamicMemBufPtr FindMemBufByKeepAddr(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block) const;
  // Sync all events without locking.
  bool SyncAllEventsInner();

#ifdef __APPLE__
  // There are some problems with using mutex on Mac, use spinlocks instead.
  SpinLock spin_lock_;
#else
  // Support multi-thread.
  std::mutex mutex_;
#endif
  MemStatusManagerPtr persistent_mem_{nullptr};
  MemStatusManagerPtr common_mem_{nullptr};
  // In graph mode, the unit size set in the context is modified by the FetchMemUnitSize function, so it
  // needs to be restored afterwards.
  size_t config_unit_size_{kDynamicMemAllocUnitSize};
  // Flag for the eager free routine: set to false at initialization, and set to true when OOM is triggered.
  bool is_trigger_eager_free_{false};

  // Key: <user_stream_id, memory_stream_id>.
  std::unordered_map<std::pair<uint32_t, uint32_t>, std::set<DynamicMemBufPtr>, pair_hash> stream_pair_addresses_;

  bool enable_vmm_{false};
  size_t eager_free_count_{0};
  size_t last_eager_free_count_{0};
};
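
// A minimal sketch of a concrete pool, assuming plain host memory stands in for device
// memory. ExampleHostMemPool is hypothetical and for illustration only; real backends
// override these hooks with device driver calls instead.
class ExampleHostMemPool : public DynamicMemPoolBestFit {
 public:
  size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) override {
    *addr = static_cast<DeviceMemPtr>(new char[size]);  // stand-in for a device malloc
    return size;                                        // report the size actually allocated
  }
  bool FreeDeviceMem(const DeviceMemPtr &addr) override {
    delete[] static_cast<char *>(addr);
    return true;
  }
  // Pretend 1 GB is always free; a real backend would query the device driver.
  size_t free_mem_size() override { return static_cast<size_t>(1) << 30; }
};
// Typical use of the pool then looks like:
//   ExampleHostMemPool pool;
//   DeviceMemPtr addr = pool.AllocTensorMem(1 << 20);  // rounded up to 512-byte alignment
//   pool.FreeTensorMem(addr);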

// Recorded information for debugging the memory allocator.
struct AllocatorDebugInfo {
  std::string name_{"Unknown"};
  AllocatorType type_{AllocatorType::kOther};
  int input_index_{-1};
  int output_index_{-1};
};

class DynamicMemAllocatorDebugInfo {
 public:
  static AllocatorDebugInfo &GetDebugInfo() noexcept { return debug_info_; }

  // Set the debug info before memory allocation.
  static void SetDebugInfo(const std::string &name, AllocatorType type, int input_index = -1, int output_index = -1) {
    debug_info_.name_ = name;
    debug_info_.type_ = type;
    debug_info_.input_index_ = input_index;
    debug_info_.output_index_ = output_index;
  }

 private:
  DynamicMemAllocatorDebugInfo() = default;
  virtual ~DynamicMemAllocatorDebugInfo() = default;
  DISABLE_COPY_AND_ASSIGN(DynamicMemAllocatorDebugInfo);

  static thread_local AllocatorDebugInfo debug_info_;
};
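
// Usage sketch (comment only; `pool` stands for any concrete pool instance):
//   DynamicMemAllocatorDebugInfo::SetDebugInfo("Conv2D", AllocatorType::kKernelOutput);
//   DeviceMemPtr out = pool.AllocTensorMem(out_size);
//   // The allocation path can read GetDebugInfo() and stamp "Conv2D"/kKernelOutput onto
//   // the new mem buf, so the Dump* routines can attribute memory in their reports.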

using TaskIdOnStreamEvent = std::pair<int64_t, DeviceEventPtr>;
struct DynamicMemBuf {
  DynamicMemBuf(DeviceMemPtr addr, DynamicMemBufStatus status, size_t size, uint32_t stream_id)
      : device_addr_(addr), status_(status), size_(size), stream_id_(stream_id) {}
  DynamicMemBuf(DeviceMemPtr addr, DynamicMemBufStatus status, size_t size, uint32_t stream_id,
                const std::string &allocator_name, AllocatorType allocator_type)
      : device_addr_(addr),
        status_(status),
        size_(size),
        stream_id_(stream_id),
        allocator_name_(allocator_name),
        allocator_type_{allocator_type} {}
  DynamicMemBuf(const DynamicMemBuf &) = delete;
  DynamicMemBuf &operator=(const DynamicMemBuf &) = delete;

  // Record an event on the mem buf.
  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, const DeviceEventPtr &event);

  // Release events on the mem buf.
  bool WaitEvent(uint32_t task_id_on_stream, uint32_t user_stream_id);

  // Indicates whether the mem buf is unused by events; returns true when no event is bound to the mem buf.
  bool IsEventNotUsed();

  // Sync all events bound to the mem buf.
  bool SyncAllEvents();

  DeviceMemPtr device_addr_;
  DynamicMemBufStatus status_;
  size_t size_;

  uint32_t stream_id_{0};

  // Debug info.
  std::string allocator_name_;
  AllocatorType allocator_type_{AllocatorType::kOther};

  // Key: user_stream_id; value: list of <task_id_on_stream, event>.
  std::shared_ptr<std::unordered_map<uint32_t, std::shared_ptr<std::list<TaskIdOnStreamEvent>>>> events_{nullptr};
};

class DynamicMemBlock {
 public:
  DynamicMemBlock() = delete;
  DynamicMemBlock(DeviceMemPtr addr_base, size_t size, const uint32_t stream_id)
      : device_addr_base_(addr_base), mem_block_size_(size), stream_id_(stream_id) {}
  ~DynamicMemBlock() { block_all_mem_buf_map_.clear(); }
  const DeviceMemPtr &device_addr() const { return device_addr_base_; }
  size_t size() const { return mem_block_size_; }
  void update_border_addr(DeviceMemPtr left_addr, DeviceMemPtr right_addr);
  size_t get_actual_peak();

#ifdef WITH_BACKEND

 private:
#endif
  friend class DynamicMemPoolBestFit;
  // MemStatusManager needs to dump block_all_mem_buf_map_ info, so add it as a friend class.
  friend class MemStatusManager;

  // The map of all memory bufs in this memory block, keyed by device address.
  DeviceAddrMapMemBuf block_all_mem_buf_map_;

  DeviceMemPtr device_addr_base_{nullptr};

  // Max addr.
  DeviceMemPtr max_addr_ = nullptr;
  // Min addr.
  DeviceMemPtr min_addr_ = nullptr;

  size_t mem_block_size_{0};
  const uint32_t stream_id_;
};
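
// Cross-stream lifetime sketch (comment only; `pool`, the ids, and `event` are
// hypothetical). One plausible flow through the event API declared above:
//   // addr is owned by memory stream 0 but still read by user stream 1:
//   pool.RecordEvent(task_id, /*user_stream_id=*/1, {{/*memory_stream_id=*/0, addr}}, event);
//   pool.FreeTensorMem(addr);  // tracked as used-by-event rather than going idle at once
//   // after stream 1 has passed task_id, release the binding so the buf can be reused:
//   pool.WaitEvent(task_id, /*user_stream_id=*/1, /*memory_stream_id=*/0);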

struct DeviceState {
  // Update the peak size.
  void UpdatePeakSize() {
    size_t total_used_size_ = total_used_mem_size_ + total_used_by_event_mem_size_;
    size_t temp_used_size_ = temp_total_used_mem_size_ + temp_total_used_by_event_mem_size_;
    used_mem_peak_size_ = std::max(used_mem_peak_size_, total_used_size_);
    if (total_used_size_ > temp_used_size_) {
      temp_used_mem_peak_size_ = std::max(temp_used_mem_peak_size_, total_used_size_ - temp_used_size_);
    }
  }

  // Memory allocated from the device.
  size_t total_mem_size_{0};
  // Memory in use.
  size_t total_used_mem_size_{0};
  // Memory in use by event.
  size_t total_used_by_event_mem_size_{0};
  // Idle memory.
  size_t total_idle_mem_size_{0};
  // Eager-freed memory.
  size_t total_eager_free_mem_size_{0};
  // Maximum peak memory usage.
  size_t used_mem_peak_size_{0};
  // Recorded data for memory in use since the maximum allocated memory was reset.
  size_t temp_total_used_mem_size_{0};
  // Recorded data for memory in use by event since the maximum allocated memory was reset.
  size_t temp_total_used_by_event_mem_size_{0};
  // Recorded data for maximum peak memory usage since the maximum allocated memory was reset.
  size_t temp_used_mem_peak_size_{0};
  // Temporarily recorded data for memory reserved since the maximum reserved memory was reset.
  size_t temp_total_mem_size_{0};
};

struct MemStatusManager {
  bool Empty() const { return mem_block_list_.empty(); }

  void AddMemBlock(const DynamicMemBlockPtr &mem_block, uint32_t stream_id);

  void DoAddMemBlock(const DynamicMemBlockPtr &mem_block, std::vector<DynamicMemBlockPtr> *mem_block_list);

  size_t CalActualPeak();

  SizeMapMemBuf &GetOrCreateMemBufMap(uint32_t stream_id, DynamicMemBufStatus status);

  void AddMemBuf(const DynamicMemBufPtr &mem_buf);

  void RemoveMemBuf(const DynamicMemBufPtr &mem_buf);

  void Clear() noexcept;

  const DeviceState DumpMemBlockDebugInfo(const std::string &mem_type);

  std::vector<uint32_t> GetStreamIds() const {
    std::vector<uint32_t> stream_ids;
    for (const auto &iter : mem_blocks_) {
      (void)stream_ids.emplace_back(iter.first);
    }
    return stream_ids;
  }

  size_t unit_size_{kDynamicMemAllocUnitSize};
  // Mem pool state.
  DeviceState mps_;

  std::vector<DynamicMemBlockPtr> mem_block_list_;
  std::vector<DynamicMemBlockPtr> mem_block_insertion_order_;
  size_t total_block_size_ = 0;
  std::unordered_map<uint32_t, std::vector<DynamicMemBlockPtr>> mem_blocks_;
  std::unordered_map<std::pair<uint32_t, DynamicMemBufStatus>, SizeMapMemBuf, pair_hash> mem_bufs_;
};
}  // namespace device
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_DYNAMIC_ALLOCATOR_H_