/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_

#include <algorithm>
#include <array>
#include <cstdint>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h"

namespace tensorflow {

// A memory allocator that implements a 'best-fit with coalescing'
// algorithm.  This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing.  One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
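//
// Example usage (an illustrative sketch, not code from this file;
// 'MySubAllocator' is a hypothetical stand-in for any concrete
// SubAllocator the caller provides, and the allocator takes ownership
// of it):
//
//   SubAllocator* sub = new MySubAllocator(...);
//   BFCAllocator bfc(sub, /*total_memory=*/1 << 30,
//                    /*allow_growth=*/true, /*name=*/"example_bfc");
//   void* p = bfc.AllocateRaw(Allocator::kAllocatorAlignment, 1024);
//   // ... use p ...
//   bfc.DeallocateRaw(p);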
class BFCAllocator : public Allocator {
 public:
  // Takes ownership of sub_allocator.
  BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
               bool allow_growth, const string& name);
  ~BFCAllocator() override;

  string Name() override { return name_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override;

  void DeallocateRaw(void* ptr) override;

  bool TracksAllocationSizes() override;

  size_t RequestedSize(const void* ptr) override;

  size_t AllocatedSize(const void* ptr) override;

  int64 AllocationId(const void* ptr) override;

  absl::optional<AllocatorStats> GetStats() override;

  void ClearStats() override;

  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }

 private:
  struct Bin;

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure,
                            uint64 freed_before_count);

  void* AllocateRawInternalWithRetry(
      size_t alignment, size_t num_bytes,
      const AllocationAttributes& allocation_attr);

  void DeallocateRawInternal(void* ptr);

  // A ChunkHandle is an index into the chunks_ vector in BFCAllocator.
  // kInvalidChunkHandle means an invalid chunk.
  typedef size_t ChunkHandle;
  static const int kInvalidChunkHandle = -1;

  typedef int BinNum;
  static const int kInvalidBinNum = -1;
  static const int kNumBins = 21;

  // A Chunk points to a piece of memory that's either entirely free or
  // entirely in use by one user memory allocation.
  //
  // An AllocationRegion's memory is split up into one or more disjoint
  // Chunks, which together cover the whole region without gaps.  Chunks
  // participate in a doubly-linked list, and the prev/next pointers point
  // to the physically adjacent chunks.
  //
  // Since a chunk cannot be partially in use, we may need to split a free
  // chunk in order to service a user allocation.  We always merge adjacent
  // free chunks.
  //
  // Chunks contain information about whether they are in use or whether
  // they are free, and contain a pointer to the bin they are in.
  struct Chunk {
    size_t size = 0;  // Full size of buffer.

    // We sometimes give chunks that are larger than needed to reduce
    // fragmentation.  requested_size keeps track of what the client
    // actually wanted so we can understand whether our splitting
    // strategy is efficient.
    size_t requested_size = 0;

    // allocation_id is set to -1 when the chunk is not in use.  It is
    // assigned a value greater than zero before the chunk is returned from
    // AllocateRaw, and this value is unique among values assigned by
    // the parent allocator.
    int64 allocation_id = -1;
    void* ptr = nullptr;  // pointer to granted subbuffer.

    // If not kInvalidChunkHandle, the memory referred to by 'prev' is
    // directly preceding the memory used by this chunk.  E.g., it should
    // start at 'ptr - prev->size'.
    ChunkHandle prev = kInvalidChunkHandle;

    // If not kInvalidChunkHandle, the memory referred to by 'next' is
    // directly following the memory used by this chunk.  E.g., it should
    // be at 'ptr + size'.
    ChunkHandle next = kInvalidChunkHandle;
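
    // For example, if chunks A, B, and C are physically adjacent in that
    // order within one AllocationRegion (with handles ha, hb, hc), the
    // invariants above mean:
    //   a->next == hb, b->prev == ha, b->next == hc, c->prev == hb
    // and a->ptr + a->size == b->ptr, b->ptr + b->size == c->ptr.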

    // What bin are we in?
    BinNum bin_num = kInvalidBinNum;

    // Optional count when this chunk was most recently made free.
    uint64 freed_count = 0;

    bool in_use() const { return allocation_id != -1; }

    string DebugString(BFCAllocator* a,
                       bool recurse) NO_THREAD_SAFETY_ANALYSIS {
      string dbg;
      strings::StrAppend(
          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
          " | in_use: ", in_use());
      if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
        Chunk* p = a->ChunkFromHandle(prev);
        strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
      }
      if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
        Chunk* n = a->ChunkFromHandle(next);
        strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
      }
      return dbg;
    }
  };

  // A Bin is a collection of similar-sized free chunks.
  struct Bin {
    // All chunks in this bin have >= bin_size memory.
    size_t bin_size = 0;

    struct ChunkComparator {
      explicit ChunkComparator(BFCAllocator* allocator)
          : allocator_(allocator) {}
      // Sort first by size and then use pointer address as a tie breaker.
      bool operator()(const ChunkHandle ha,
                      const ChunkHandle hb) const NO_THREAD_SAFETY_ANALYSIS {
        const Chunk* a = allocator_->ChunkFromHandle(ha);
        const Chunk* b = allocator_->ChunkFromHandle(hb);
        if (a->size != b->size) {
          return a->size < b->size;
        }
        return a->ptr < b->ptr;
      }

     private:
      BFCAllocator* allocator_;  // The parent allocator.
    };

    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
    // Set of free chunks within the bin, sorted by chunk size.
    // The Chunks themselves are not owned.
    FreeChunkSet free_chunks;
    Bin(BFCAllocator* allocator, size_t bs)
        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
  };

  static const size_t kMinAllocationBits = 8;
  static const size_t kMinAllocationSize = 1 << kMinAllocationBits;

  // BFCAllocator allocates memory into a collection of disjoint
  // AllocationRegions.  Each AllocationRegion corresponds to one call to
  // SubAllocator::Alloc().
  //
  // An AllocationRegion contains one or more Chunks, covering all of its
  // memory.  Its primary job is to map pointers to ChunkHandles.
  //
  // This class is thread-compatible.
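  //
  // For example (an illustrative sketch of the mapping, with
  // kMinAllocationSize == 256): a region of 1024 bytes based at a char*
  // 'base' has four handle slots, and a pointer p inside the region
  // resolves to slot (p - base) / 256:
  //
  //   AllocationRegion r(base, 1024);
  //   r.set_handle(base + 256, h);   // slot 1 now maps to chunk h
  //   r.get_handle(base + 256);      // == h
  //   r.get_handle(base + 300);      // also == h (same 256-byte slot)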
  class AllocationRegion {
   public:
    AllocationRegion(void* ptr, size_t memory_size)
        : ptr_(ptr),
          memory_size_(memory_size),
          end_ptr_(
              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
      DCHECK_EQ(0, memory_size % kMinAllocationSize);
      const size_t n_handles =
          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.reset(new ChunkHandle[n_handles]);
      for (size_t i = 0; i < n_handles; i++) {
        handles_[i] = kInvalidChunkHandle;
      }
    }

    AllocationRegion() = default;
    AllocationRegion(AllocationRegion&& other) { Swap(other); }
    AllocationRegion& operator=(AllocationRegion&& other) {
      Swap(other);
      return *this;
    }

    void* ptr() const { return ptr_; }
    void* end_ptr() const { return end_ptr_; }
    size_t memory_size() const { return memory_size_; }
    ChunkHandle get_handle(const void* p) const {
      return handles_[IndexFor(p)];
    }
    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }

   private:
    void Swap(AllocationRegion& other) {
      std::swap(ptr_, other.ptr_);
      std::swap(memory_size_, other.memory_size_);
      std::swap(end_ptr_, other.end_ptr_);
      std::swap(handles_, other.handles_);
    }

    int IndexFor(const void* p) const {
      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
      DCHECK_GE(p_int, base_int);
      DCHECK_LT(p_int, base_int + memory_size_);
      return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
    }

    // Metadata about the allocation region.
    void* ptr_ = nullptr;
    size_t memory_size_ = 0;
    void* end_ptr_ = nullptr;

    // Array of size "memory_size / kMinAllocationSize".  It is indexed by
    // (p - base) / kMinAllocationSize, and contains the ChunkHandle for the
    // memory allocation represented by "p".
    std::unique_ptr<ChunkHandle[]> handles_;

    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
  };

  // RegionManager aggregates one or more "AllocationRegions" and provides
  // a layer of indirection from pointers to the underlying ChunkHandle,
  // allowing allocation across multiple discontiguous memory regions.
  //
  // This class is thread-compatible.
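  //
  // For example, with two regions [a, a + 1MiB) and [b, b + 1MiB) where
  // a < b, the regions_ vector is kept sorted by end_ptr.  Looking up a
  // pointer p in the first region does an upper_bound search for the
  // first region whose end_ptr compares greater than p; since
  // p < a + 1MiB, that is the first region, which indeed contains p.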
  class RegionManager {
   public:
    RegionManager() {}
    ~RegionManager() {}

    void AddAllocationRegion(void* ptr, size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
    }

    ChunkHandle get_handle(const void* p) const {
      return RegionFor(p)->get_handle(p);
    }

    void set_handle(const void* p, ChunkHandle h) {
      return MutableRegionFor(p)->set_handle(p, h);
    }
    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }

    const std::vector<AllocationRegion>& regions() const { return regions_; }

   private:
    static bool Comparator(const void* ptr, const AllocationRegion& other) {
      return ptr < other.end_ptr();
    }

    AllocationRegion* MutableRegionFor(const void* p) {
      return const_cast<AllocationRegion*>(RegionFor(p));
    }

    const AllocationRegion* RegionFor(const void* p) const {
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);

      if (entry != regions_.end()) {
        return &(*entry);
      }

      LOG(FATAL) << "Could not find Region for " << p;
      return nullptr;
    }

   private:
    std::vector<AllocationRegion> regions_;
  };

  // Returns 'bytes' rounded up to the next highest multiple of
  // kMinAllocationSize.
  static size_t RoundedBytes(size_t bytes);

  // Try to add a new memory region that can satisfy an allocation of
  // 'rounded_bytes' bytes.  Returns true on success and false on
  // failure.
  bool Extend(size_t alignment, size_t rounded_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns a pointer to an underlying allocated chunk of size
  // 'rounded_bytes'.
  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
                     uint64 freed_before) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Splits the chunk specified by 'h' into two chunks, one at least
  // of size 'num_bytes'.
  void SplitChunk(ChunkHandle h, size_t num_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Merges the two chunk handles.  Requires that the chunks are
  // contiguous in their allocation.
  void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Frees the memory represented by 'h', coalescing the chunk if
  // possible.
  void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds the chunk 'h' to the proper free bin.
  void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the free chunk pointed to by 'c' from the set free_chunks.
  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
                                  const Bin::FreeChunkSet::iterator& c)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes a free chunk from the bin.
  void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
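
  // Putting the pieces above together, the allocation path is roughly the
  // following (an illustrative sketch; see bfc_allocator.cc for the actual
  // control flow):
  //
  //   size_t rounded_bytes = RoundedBytes(num_bytes);  // multiple of 256
  //   BinNum bin_num = BinNumForSize(rounded_bytes);
  //   void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, 0);
  //   if (ptr == nullptr && Extend(alignment, rounded_bytes)) {
  //     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, 0);
  //   }
  //
  // FindChunkPtr searches bins from 'bin_num' upward for the smallest free
  // chunk that fits, splitting it via SplitChunk when it is much larger
  // than requested; DeallocateRaw reverses this via FreeAndMaybeCoalesce,
  // merging the chunk with its free physical neighbors.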

  // Removes the chunk metadata represented by 'h'.
  void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Information about a Bin that is useful for debugging.
  struct BinDebugInfo {
    size_t total_bytes_in_use = 0;
    size_t total_bytes_in_bin = 0;
    size_t total_requested_bytes_in_use = 0;
    size_t total_chunks_in_use = 0;
    size_t total_chunks_in_bin = 0;
  };

  // Computes and returns a BinDebugInfo for each Bin.
  std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  AllocatorRetry retry_helper_;

  // Structures immutable after construction.
  size_t memory_limit_ = 0;

  // Returns floor(log2(n)) by repeated shifting; portable fallback for
  // Log2FloorNonZero below.
  inline int Log2FloorNonZeroSlow(uint64 n) {
    int r = 0;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r - 1;
  }

  // Returns floor(log2(n)).
  inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
    return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS) && (_WIN64)
    unsigned long index;
    _BitScanReverse64(&index, n);
    return index;
#else
    return Log2FloorNonZeroSlow(n);
#endif
  }

  // Map from bin size to Bin.
  Bin* BinFromIndex(BinNum index) {
    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
  }
  // Bin 'index' holds free chunks of at least 256 << index bytes, so e.g.
  // bin 0 covers sizes in [256, 512) and bin 20 (the last) covers 256MiB
  // and up.
  size_t BinNumToSize(BinNum index) {
    return static_cast<size_t>(256) << index;
  }
  BinNum BinNumForSize(size_t bytes) {
    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
    return b;
  }
  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }

  char bins_space_[sizeof(Bin) * kNumBins];

  // The size of the current region allocation.
  size_t curr_region_allocation_bytes_;

  // The total number of bytes allocated by the allocator.
  size_t total_region_allocated_bytes_ = 0;

  // An indicator that expansion of a region has hit the limits
  // of the available memory.
  bool started_backpedal_ = false;

  std::unique_ptr<SubAllocator> sub_allocator_;
  string name_;
  SharedCounter* timing_counter_ = nullptr;

  // Structures mutable after construction.
  mutable mutex lock_;
  RegionManager region_manager_ GUARDED_BY(lock_);

  std::vector<Chunk> chunks_ GUARDED_BY(lock_);

  // Pointer to head of linked list of free Chunks.
  ChunkHandle free_chunks_list_ GUARDED_BY(lock_);

  // Counter containing the next unique identifier to assign to a
  // newly-created chunk.
  int64 next_allocation_id_ GUARDED_BY(lock_);

  // Stats.
  AllocatorStats stats_ GUARDED_BY(lock_);

  friend class GPUBFCAllocatorPrivateMethodsTest;
  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_