• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_DYNAMIC_ALLOCATOR_H_
18 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_DYNAMIC_ALLOCATOR_H_
19 
20 #include <algorithm>
21 #include <functional>
22 #include <list>
23 #include <map>
24 #include <memory>
25 #include <mutex>
26 #include <set>
27 #include <thread>
28 #include <unordered_map>
29 #include <utility>
30 #include <vector>
31 #include <random>
32 #include <string>
33 #include <tuple>
34 
35 #include "utils/ms_utils.h"
36 #include "include/backend/visible.h"
37 #include "include/common/utils/stream_util.h"
38 #include "ir/device_event.h"
39 #ifdef __APPLE__
40 #include "mindrt/include/async/spinlock.h"
41 #endif
42 
43 namespace mindspore {
44 namespace device {
// The status of a memory buf: idle (reusable), used (handed to a caller),
// eager-freed (virtually released back to the driver), or retained only
// because an event still references it.
enum class DynamicMemBufStatus : int { kMemBufIdle, kMemBufUsed, kMemBufEagerFree, kMemBufUsedByEvent };
// Memory allocator type is used to record the memory classification statistics information.
enum class AllocatorType : int { kWeight, kConstantValue, kKernelOutput, kGraphOutput, kWorkspace, kOther };
// Shift amount used by pair_hash below: sizeof(size_t) << kShiftOffset == half the bit width of size_t.
constexpr int kShiftOffset = 2;
// Number of enumerators in AllocatorType; keep in sync when extending that enum.
constexpr int kAllocatorTypeNum = 6;
// Alloc memory aligned according to 512 bytes.
constexpr size_t kDynamicMemAlignSize = 512;
// The minimum unit size (1G) of memory block used for dynamic extend.
constexpr size_t kDynamicMemAllocUnitSize = 1024 << 20;
55 
// Raw device address handle. Note: `void(*)` is an unusual but valid spelling of `void *`.
using DeviceMemPtr = void(*);
// The comparator of device addresses, ordering from small to large.
// std::less is used instead of the raw `<` operator: relational comparison of
// pointers into different allocations is unspecified behavior, while
// std::less<T*> is guaranteed to impose a strict total order on all pointers.
struct DeviceAddrCmp {
  bool operator()(const DeviceMemPtr &addr1, const DeviceMemPtr &addr2) const {
    return std::less<DeviceMemPtr>{}(addr1, addr2);
  }
};
61 
// The AllocatorDebugInfo wrapper, which is thread-local state for the dynamic memory pool.
class DynamicMemAllocatorDebugInfo;
// Memory buf is the smallest operation object of the dynamic memory pool.
struct DynamicMemBuf;
using DynamicMemBufPtr = std::shared_ptr<DynamicMemBuf>;
// Multimap key is the tensor size, for finding an idle memory buf by tensor size (best-fit lookup).
using SizeMapMemBuf = std::multimap<size_t, DynamicMemBufPtr>;
// Map key is the device address, for finding the used memory buf in a memory block by device address.
using DeviceAddrMapMemBuf = std::map<DeviceMemPtr, DynamicMemBufPtr, DeviceAddrCmp>;
// Memory block is composed of memory bufs.
class DynamicMemBlock;
using DynamicMemBlockPtr = std::shared_ptr<DynamicMemBlock>;

// Per-pool (common/persistent) bookkeeping structure, defined at the bottom of this header.
struct MemStatusManager;
using MemStatusManagerPtr = std::shared_ptr<MemStatusManager>;
77 
78 // pair has no hash method, need override it.
79 struct pair_hash {
80   template <class L, class R>
operatorpair_hash81   std::size_t operator()(const std::pair<L, R> &param) const {
82     size_t hash = std::hash<L>{}(param.first);
83     hash <<= (sizeof(size_t) << kShiftOffset);
84     hash ^= std::hash<R>{}(param.second);
85     return std::hash<size_t>{}(hash);
86   }
87 };
88 
// The main class of the dynamic memory pool: a best-fit allocator that carves
// device memory blocks into bufs, split/combined on alloc/free. Device-specific
// backends subclass it and implement the pure-virtual raw alloc/free hooks.
class BACKEND_EXPORT DynamicMemPoolBestFit {
 public:
  DynamicMemPoolBestFit()
      : persistent_mem_(std::make_shared<MemStatusManager>()), common_mem_(std::make_shared<MemStatusManager>()) {}
  virtual ~DynamicMemPoolBestFit();

  // The main program entry of memory alloc.
  DeviceMemPtr AllocTensorMem(size_t size, bool from_persistent_mem = false, bool need_recycle = false,
                              uint32_t stream_id = kDefaultStreamIndex);
  // The main program entry of continuous memory alloc.
  std::vector<DeviceMemPtr> AllocContinuousTensorMem(const std::vector<size_t> &size_list,
                                                     uint32_t stream_id = kDefaultStreamIndex);
  // The main program entry of memory free.
  void FreeTensorMem(const DeviceMemPtr &device_addr);
  // The main program entry for freeing some memories while keeping others.
  void FreePartTensorMems(const std::vector<DeviceMemPtr> &free_addrs, const std::vector<DeviceMemPtr> &keep_addrs,
                          const std::vector<size_t> &keep_addr_sizes);

  // Release the real device memory.
  void ReleaseDeviceRes();

  // Get the minimum memory unit size used for dynamic extend.
  size_t MemAllocUnitSize(bool from_persistent_mem = false) const;
  // Set the minimum memory unit size used for dynamic extend.
  // NOTE(review): "Uint" is a typo for "Unit"; the name is kept for API compatibility with callers.
  void SetMemAllocUintSize(size_t common_size, size_t persist_size = kDynamicMemAllocUnitSize);

  // Extract detailed block information of the given pool manager.
  std::unordered_map<device::DeviceMemPtr, std::unordered_map<std::string, size_t>> ExtractBlocksListInfo(
    const MemStatusManagerPtr &mem_mng) const;

  // The statistics information (aggregated over both the common and persistent pools).
  size_t TotalMemStatistics() const;
  size_t TotalUsedMemStatistics() const;
  size_t TotalUsedByEventMemStatistics() const;
  size_t TotalIdleMemStatistics() const;
  size_t TotalEagerFreeMemStatistics() const;
  size_t UsedMemPeakStatistics() const;
  size_t MaxMemAllocatedStatistics() const;
  size_t MaxMemReservedStatistics() const;
  size_t ActualPeakStatistics() const;
  std::unordered_map<std::string, std::size_t> BlockCountsStatistics() const;
  std::unordered_map<std::string, std::size_t> BlockUnitSizeStatistics() const;
  std::unordered_map<device::DeviceMemPtr, std::unordered_map<std::string, size_t>> CommonMemBlocksInfoStatistics()
    const;
  std::unordered_map<device::DeviceMemPtr, std::unordered_map<std::string, size_t>> PersistentMemBlocksInfoStatistics()
    const;
  // Reset the "max since last reset" counters (declared const; presumably the counters
  // they touch live behind the shared MemStatusManager pointers — confirm in the .cc).
  void ResetMaxMemReserved() const;
  void ResetMaxMemAllocated() const;

  // Display the brief state information of memory blocks and memory bufs.
  void DumpDynamicMemPoolStateInfo();
  // Display the detailed debug information of memory blocks and memory bufs.
  void DumpDynamicMemPoolDebugInfo();

  // Defragment pool memory; implementation lives in the .cc file.
  void DefragMemory();

  // The related interfaces of real device memory operations; must be overridden per device type.
  virtual size_t AllocDeviceMem(size_t size, DeviceMemPtr *addr) = 0;
  virtual bool FreeDeviceMem(const DeviceMemPtr &addr) = 0;
  virtual size_t free_mem_size() = 0;
  virtual uint64_t total_mem_size() const { return 0; }
  // Set mem pool block size.
  virtual void SetMemPoolBlockSize(size_t available_device_mem_size);
  virtual size_t GetMaxUsedMemSize() const { return 0; }

  // Record an event for the given addresses. Element in vector: <memory_stream_id, address>.
  bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id,
                   const std::vector<std::pair<uint32_t, DeviceMemPtr>> &memory_stream_addresses,
                   const DeviceEventPtr &event);
  bool WaitEvent(int64_t task_id_on_stream, uint32_t user_stream_id, uint32_t memory_stream_id);
  bool WaitEvent(int64_t task_id_on_stream, uint32_t memory_stream_id);
  bool SyncAllEvents();
  virtual std::string GetMemoryPoolType() const { return "Other"; }
#ifdef WITH_BACKEND

 protected:
#endif
  const MemStatusManagerPtr &common_mem() const { return common_mem_; }
  const MemStatusManagerPtr &persistent_mem() const { return persistent_mem_; }
  void *GetMinUsingMemoryAddr() const;
  // The real size after memory-alloc alignment.
  virtual size_t AlignMemorySize(size_t size) const;
  // Calculate the required alloc size when adding a new memory block.
  virtual size_t CalMemBlockAllocSize(size_t size, bool from_persistent_mem, bool need_recycle = false);
  std::set<DeviceMemPtr> mem_bufs_;
  // The related interfaces of device-memory eager free.
  // NOTE(review): the `const bool` by-value returns below are meaningless top-level
  // consts; kept unchanged because subclasses outside this file override these signatures.
  virtual const bool IsEnableEagerFree() const { return false; }
  const bool IsEnableVmm() const { return enable_vmm_; }
  void SetEnableVmm(bool enable_vmm) { enable_vmm_ = enable_vmm; }
  virtual const bool SyncAllStreams() { return false; }
  virtual size_t AllocDeviceMemByEagerFree(size_t size, DeviceMemPtr *addr) { return 0; }
  virtual size_t FreeDeviceMemByEagerFree(const DeviceMemPtr addr, const size_t size) { return 0; }
  virtual size_t MmapDeviceMem(size_t size, DeviceMemPtr addr) { return 0; }
  const size_t FreeIdleMemsByEagerFree();
#ifdef WITH_BACKEND

 private:
#endif
  // Find an available memory buf from all pools by status, which covers idle and eager-free bufs.
  DeviceMemPtr FindAvailableMemBuf(size_t size, bool from_persistent_mem, uint32_t stream_id);
  // Find a memory buf with the target status from all pools by aligned size when allocating.
  DeviceMemPtr FindMemBufByStatus(size_t size, bool from_persistent_mem, DynamicMemBufStatus target_status,
                                  uint32_t stream_id);
  // Find a memory buf with the target status from a specific pool by aligned size when allocating.
  DeviceMemPtr FindMemBufInSpecifiedMng(size_t size, bool from_persistent_mem, DynamicMemBufStatus target_status,
                                        uint32_t stream_id);

  // Add a memory block and memory buf.
  DeviceMemPtr AddMemBlockAndMemBuf(size_t size, bool from_persistent_mem, bool need_recycle, uint32_t stream_id);
  // Add a memory block and memory buf via the eager-free API.
  DeviceMemPtr AddMemBlockAndMemBufByEagerFree(size_t size, bool from_persistent_mem, uint32_t stream_id);
  // Add a memory block and memory buf when allocation found no available memory buf.
  DeviceMemPtr CreateMemBlockAndMemBuf(size_t size, bool from_persistent_mem, DeviceMemPtr source_addr,
                                       size_t source_size, DynamicMemBufStatus mem_buf_status, uint32_t stream_id);

  // Judge whether the memory buf needs splitting, by alloc size and memory buf size.
  bool IsSplit(size_t tensor_size, size_t mem_buf_size) const;
  // Split the memory buf by alloc size.
  void SplitMemBuf(size_t size, const DynamicMemBufPtr &mem_buf, const MemStatusManagerPtr &mem_mng,
                   uint32_t stream_id);

  // Find the memory block by device address.
  DynamicMemBlockPtr FindMemBlock(const DeviceMemPtr &device_addr, const MemStatusManagerPtr &mem_mng) const;
  // The comparator of memory blocks by device address, because memory blocks are kept ordered by device address.
  static bool CmpMemBlock(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block);

  // Free memory without taking the lock; the caller must hold it.
  void FreeTensorMemInner(const DeviceMemPtr &device_addr);
  // Pre-combine a mem buf; returns false when the mem buf cannot be combined.
  bool PreCombineMemBuf(const DynamicMemBufPtr &mem_buf, const MemStatusManagerPtr &mem_mng);
  // Combine memory bufs on free, to avoid memory fragmentation.
  void CombineMemBuf(const DynamicMemBlockPtr &mem_block, const DeviceAddrMapMemBuf::iterator &iter,
                     const MemStatusManagerPtr &mem_mng, DynamicMemBufStatus origin_status,
                     DynamicMemBufStatus target_status);
  // Fetch the mem info by the exact address.
  std::tuple<DynamicMemBlockPtr, DeviceAddrMapMemBuf::iterator, MemStatusManagerPtr> FindByStrictAddr(
    const DeviceMemPtr &device_addr) const;

  // Keep part of the memory, identified by address.
  void KeepTensorMemByAddr(const DeviceMemPtr &device_addr, size_t size);
  std::tuple<DynamicMemBlockPtr, DynamicMemBufPtr, MemStatusManagerPtr> FindByKeepAddr(
    const DeviceMemPtr &device_addr) const;
  DynamicMemBufPtr FindMemBufByKeepAddr(const DeviceMemPtr &device_addr, const DynamicMemBlockPtr &mem_block) const;
  // Sync all events without taking the lock.
  bool SyncAllEventsInner();

#ifdef __APPLE__
  // There are some problems with using mutex on Mac, use spinlocks instead.
  SpinLock spin_lock_;
#else
  // Support multi-thread.
  std::mutex mutex_;
#endif
  MemStatusManagerPtr persistent_mem_{nullptr};
  MemStatusManagerPtr common_mem_{nullptr};
  // In graph mode, the unit size set in the context is modified through the FetchMemUnitSize function,
  // so it needs to be restored afterwards; this caches the configured value.
  size_t config_unit_size_{kDynamicMemAllocUnitSize};
  // Flag for the eager-free routine: false at initialization, set to true when OOM triggers.
  bool is_trigger_eager_free_{false};

  // Key: <user_stream_id, memory_stream_id>; value: bufs waiting on events for that stream pair.
  std::unordered_map<std::pair<uint32_t, uint32_t>, std::set<DynamicMemBufPtr>, pair_hash> stream_pair_addresses_;

  bool enable_vmm_{false};
  size_t eager_free_count_{0};
  size_t last_eager_free_count_{0};
};
258 
// Recording information for debugging the memory allocator:
// which actor/kernel requested the allocation and for what purpose.
struct AllocatorDebugInfo {
  std::string name_{"Unknown"};                  // Requester name (e.g. kernel/actor); "Unknown" until set.
  AllocatorType type_{AllocatorType::kOther};    // Memory classification for statistics.
  int input_index_{-1};                          // Input index of the request; -1 means not applicable.
  int output_index_{-1};                         // Output index of the request; -1 means not applicable.
};
266 
267 class DynamicMemAllocatorDebugInfo {
268  public:
GetDebugInfo()269   static AllocatorDebugInfo &GetDebugInfo() noexcept { return debug_info_; }
270 
271   // Set the debug info when memory alloc.
272   static void SetDebugInfo(const std::string &name, AllocatorType type, int input_index = -1, int output_index = -1) {
273     debug_info_.name_ = name;
274     debug_info_.type_ = type;
275     debug_info_.input_index_ = input_index;
276     debug_info_.output_index_ = output_index;
277   }
278 
279  private:
280   DynamicMemAllocatorDebugInfo() = default;
281   virtual ~DynamicMemAllocatorDebugInfo() = default;
282   DISABLE_COPY_AND_ASSIGN(DynamicMemAllocatorDebugInfo);
283 
284   static thread_local AllocatorDebugInfo debug_info_;
285 };
286 
287 using TaskIdOnStreamEvent = std::pair<int64_t, DeviceEventPtr>;
288 struct DynamicMemBuf {
DynamicMemBufDynamicMemBuf289   DynamicMemBuf(DeviceMemPtr addr, DynamicMemBufStatus status, size_t size, uint32_t stream_id)
290       : device_addr_(addr), status_(status), size_(size), stream_id_(stream_id) {}
DynamicMemBufDynamicMemBuf291   DynamicMemBuf(DeviceMemPtr addr, DynamicMemBufStatus status, size_t size, uint32_t stream_id,
292                 const std::string &allocator_name, AllocatorType allocator_type)
293       : device_addr_(addr),
294         status_(status),
295         size_(size),
296         stream_id_(stream_id),
297         allocator_name_(allocator_name),
298         allocator_type_{allocator_type} {}
299   DynamicMemBuf(const DynamicMemBuf &) = delete;
300   DynamicMemBuf &operator=(const DynamicMemBuf &) = delete;
301 
302   // Record event on mem buf.
303   bool RecordEvent(int64_t task_id_on_stream, uint32_t user_stream_id, const DeviceEventPtr &event);
304 
305   // Release events on mem buf.
306   bool WaitEvent(uint32_t task_id_on_stream, uint32_t user_stream_id);
307 
308   // Indidates if mem buf used by event, return true when no event bind on mem buf.
309   bool IsEventNotUsed();
310 
311   // Sync all events that bound on mem buf.
312   bool SyncAllEvents();
313 
314   DeviceMemPtr device_addr_;
315   DynamicMemBufStatus status_;
316   size_t size_;
317 
318   uint32_t stream_id_{0};
319 
320   // Debug info.
321   std::string allocator_name_;
322   AllocatorType allocator_type_{AllocatorType::kOther};
323 
324   // Parameter: user_stream_id, list of <task_id_on_stream, event>.
325   std::shared_ptr<std::unordered_map<uint32_t, std::shared_ptr<std::list<TaskIdOnStreamEvent>>>> events_{nullptr};
326 };
327 
// Memory block: one large device allocation that is carved into DynamicMemBufs.
class DynamicMemBlock {
 public:
  DynamicMemBlock() = delete;
  DynamicMemBlock(DeviceMemPtr addr_base, size_t size, const uint32_t stream_id)
      : device_addr_base_(addr_base), mem_block_size_(size), stream_id_(stream_id) {}
  ~DynamicMemBlock() { block_all_mem_buf_map_.clear(); }
  // Base device address of this block.
  const DeviceMemPtr &device_addr() const { return device_addr_base_; }
  // Total size of this block in bytes.
  size_t size() const { return mem_block_size_; }
  // Presumably widens the recorded [min_addr_, max_addr_] span of handed-out
  // addresses — defined in the .cc; confirm there.
  void update_border_addr(DeviceMemPtr left_addr, DeviceMemPtr right_addr);
  // Peak usage of this block; defined in the .cc.
  size_t get_actual_peak();

#ifdef WITH_BACKEND

 private:
#endif
  friend class DynamicMemPoolBestFit;
  // MemStatusManager needs to dump block_all_mem_buf_map_ info, so it is a friend class.
  friend class MemStatusManager;

  // The map of all memory bufs in this memory block, keyed by device address.
  DeviceAddrMapMemBuf block_all_mem_buf_map_;

  DeviceMemPtr device_addr_base_{nullptr};

  // Highest address observed (see update_border_addr).
  DeviceMemPtr max_addr_ = nullptr;
  // Lowest address observed (see update_border_addr).
  DeviceMemPtr min_addr_ = nullptr;

  size_t mem_block_size_{0};
  const uint32_t stream_id_;
};
360 
// Aggregated allocation counters for one memory pool (common or persistent).
struct DeviceState {
  // Update the all-time peak and the "since last reset" peak from the current counters.
  void UpdatePeakSize() {
    // Locals renamed: the originals carried the trailing-underscore member
    // convention (total_used_size_/temp_used_size_), which misreads as state.
    const size_t current_used_size = total_used_mem_size_ + total_used_by_event_mem_size_;
    const size_t reset_baseline_size = temp_total_used_mem_size_ + temp_total_used_by_event_mem_size_;
    used_mem_peak_size_ = std::max(used_mem_peak_size_, current_used_size);
    // The since-reset peak only grows while current usage exceeds the baseline
    // recorded at the last ResetMaxMemAllocated.
    if (current_used_size > reset_baseline_size) {
      temp_used_mem_peak_size_ = std::max(temp_used_mem_peak_size_, current_used_size - reset_baseline_size);
    }
  }

  // Memory allocated from the device.
  size_t total_mem_size_{0};
  // Memory in use.
  size_t total_used_mem_size_{0};
  // Memory in use by event.
  size_t total_used_by_event_mem_size_{0};
  // Memory in idle.
  size_t total_idle_mem_size_{0};
  // Memory in eager free.
  size_t total_eager_free_mem_size_{0};
  // Maximum peak memory usage.
  size_t used_mem_peak_size_{0};
  // Recorded data for memory in use since reset of maximum allocated memory.
  size_t temp_total_used_mem_size_{0};
  // Recorded data for memory in use by event since reset of maximum allocated memory.
  size_t temp_total_used_by_event_mem_size_{0};
  // Recorded data for maximum peak memory usage since reset of maximum allocated memory.
  size_t temp_used_mem_peak_size_{0};
  // Temporary recorded data for memory reserved since reset of maximum reserved memory.
  size_t temp_total_mem_size_{0};
};
393 
394 struct MemStatusManager {
EmptyMemStatusManager395   bool Empty() const { return mem_block_list_.empty(); }
396 
397   void AddMemBlock(const DynamicMemBlockPtr &mem_block, uint32_t stream_id);
398 
399   void DoAddMemBlock(const DynamicMemBlockPtr &mem_block, std::vector<DynamicMemBlockPtr> *mem_block_list);
400   size_t CalActualPeak();
401 
402   SizeMapMemBuf &GetOrCreateMemBufMap(uint32_t stream_id, DynamicMemBufStatus status);
403 
404   void AddMemBuf(const DynamicMemBufPtr &mem_buf);
405 
406   void RemoveMemBuf(const DynamicMemBufPtr &mem_buf);
407 
408   void Clear() noexcept;
409 
410   const DeviceState DumpMemBlockDebugInfo(const std::string &mem_type);
411 
GetStreamIdsMemStatusManager412   std::vector<uint32_t> GetStreamIds() const {
413     std::vector<uint32_t> stream_ids;
414     for (const auto &iter : mem_blocks_) {
415       (void)stream_ids.emplace_back(iter.first);
416     }
417     return stream_ids;
418   }
419 
420   size_t unit_size_{kDynamicMemAllocUnitSize};
421   // Mem pool state
422   DeviceState mps_;
423 
424   std::vector<DynamicMemBlockPtr> mem_block_list_;
425   std::vector<DynamicMemBlockPtr> mem_block_insertion_order_;
426   size_t total_block_size_ = 0;
427   std::unordered_map<uint32_t, std::vector<DynamicMemBlockPtr>> mem_blocks_;
428   std::unordered_map<std::pair<uint32_t, DynamicMemBufStatus>, SizeMapMemBuf, pair_hash> mem_bufs_;
429 };
430 }  // namespace device
431 }  // namespace mindspore
432 #endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_MEM_REUSE_MEM_DYNAMIC_ALLOCATOR_H_
433