• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/common_runtime/bfc_allocator.h"
17 
18 #include <atomic>
19 
20 #include "absl/strings/string_view.h"
21 #include "tensorflow/core/common_runtime/allocator_retry.h"
22 #include "tensorflow/core/lib/core/bits.h"
23 #include "tensorflow/core/lib/strings/numbers.h"
24 #include "tensorflow/core/lib/strings/str_util.h"
25 #include "tensorflow/core/lib/strings/strcat.h"
26 #include "tensorflow/core/platform/file_system.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/mutex.h"
29 #ifdef TENSORFLOW_MEM_DEBUG
30 #include "tensorflow/core/platform/stacktrace.h"
31 #endif
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/platform/types.h"
34 #include "tensorflow/core/profiler/lib/traceme.h"
35 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
36 
37 namespace tensorflow {
38 
39 constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;
40 
BFCAllocator(SubAllocator * sub_allocator,size_t total_memory,bool allow_growth,const string & name,bool garbage_collection)41 BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
42                            bool allow_growth, const string& name,
43                            bool garbage_collection)
44     : garbage_collection_(garbage_collection),
45       coalesce_regions_(sub_allocator->SupportsCoalescing()),
46       sub_allocator_(sub_allocator),
47       name_(name),
48       free_chunks_list_(kInvalidChunkHandle),
49       next_allocation_id_(1) {
50   if (allow_growth) {
51     // 2MiB smallest initial allocation, unless total memory available
52     // is less.
53     curr_region_allocation_bytes_ =
54         RoundedBytes(std::min(total_memory, size_t{2 << 20}));
55   } else {
56     curr_region_allocation_bytes_ = RoundedBytes(total_memory);
57   }
58 
59   // Allocate the requested amount of memory.
60   memory_limit_ = total_memory;
61   stats_.bytes_limit = static_cast<int64>(total_memory);
62 
63   // Create a bunch of bins of various good sizes.
64 
65   // We create bins to fit all possible ranges that cover the
66   // memory_limit_ starting from allocations up to 256 bytes to
67   // allocations up to (and including) the memory limit.
68   VLOG(1) << "Creating new BFCAllocator named: " << name;
69   for (BinNum b = 0; b < kNumBins; b++) {
70     size_t bin_size = BinNumToSize(b);
71     VLOG(1) << "Creating bin of max chunk size "
72             << strings::HumanReadableNumBytes(bin_size);
73     new (BinFromIndex(b)) Bin(this, bin_size);
74     CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
75     CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
76     CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
77     if (b + 1 < kNumBins) {
78       CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
79     }
80   }
81 }
82 
~BFCAllocator()83 BFCAllocator::~BFCAllocator() {
84   // Return memory back.
85   VLOG(2) << "Number of regions allocated: "
86           << region_manager_.regions().size();
87   for (const auto& region : region_manager_.regions()) {
88     sub_allocator_->Free(region.ptr(), region.memory_size());
89   }
90 
91   for (BinNum b = 0; b < kNumBins; b++) {
92     BinFromIndex(b)->~Bin();
93   }
94 }
95 
ChunkFromHandle(ChunkHandle h)96 BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
97   DCHECK_GE(h, 0);
98   DCHECK_LT(h, static_cast<int>(chunks_.size()));
99   return &(chunks_[h]);
100 }
101 
ChunkFromHandle(ChunkHandle h) const102 const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const {
103   DCHECK_GE(h, 0);
104   DCHECK_LT(h, static_cast<int>(chunks_.size()));
105   return &(chunks_[h]);
106 }
107 
Extend(size_t alignment,size_t rounded_bytes)108 bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
109   size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
110   // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
111   available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
112 
113   // Do we have enough space to handle the client's request?
114   // If not, fail immediately.
115   if (rounded_bytes > available_bytes) {
116     return false;
117   }
118 
119   // If curr_region_allocation_bytes_ is not enough to satisfy the
120   // allocation, keep multiplying by a power of two until that is
121   // sufficient.
122   bool increased_allocation = false;
123   while (rounded_bytes > curr_region_allocation_bytes_) {
124     curr_region_allocation_bytes_ *= 2;
125     increased_allocation = true;
126   }
127 
128   // Try allocating.
129   size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
130   size_t bytes_received;
131   void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
132   if (mem_addr == nullptr && !started_backpedal_) {
133     // Only backpedal once.
134     started_backpedal_ = true;
135 
136     static constexpr float kBackpedalFactor = 0.9;
137 
138     // Try allocating less memory.
139     while (mem_addr == nullptr) {
140       bytes = RoundedBytes(bytes * kBackpedalFactor);
141       if (bytes < rounded_bytes) break;
142       mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
143     }
144   }
145 
146   if (mem_addr == nullptr) {
147     return false;
148   }
149 
150   if (!increased_allocation) {
151     // Increase the region size of the next required allocation.
152     curr_region_allocation_bytes_ *= 2;
153   }
154 
155   VLOG(1) << "Extending allocation by "
156           << strings::HumanReadableNumBytes(bytes_received) << " bytes for "
157           << Name() << ".";
158 
159   total_region_allocated_bytes_ += bytes_received;
160   VLOG(1) << "Total allocated bytes: "
161           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);
162 
163   VLOG(1) << "Allocated memory at " << mem_addr << " to "
164           << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
165 
166   AllocationRegion* maybe_extended_region = nullptr;
167   if (coalesce_regions_) {
168     maybe_extended_region =
169         region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
170   } else {
171     region_manager_.AddAllocationRegion(mem_addr, bytes_received);
172   }
173 
174   // Create one large chunk for the whole memory space that will
175   // be chunked later.
176   ChunkHandle h = AllocateChunk();
177   BFCAllocator::Chunk* c = ChunkFromHandle(h);
178   c->ptr = mem_addr;
179   c->size = bytes_received;
180   c->allocation_id = -1;
181   c->prev = kInvalidChunkHandle;
182   c->next = kInvalidChunkHandle;
183   c->freed_at_count = 0;
184 
185   region_manager_.set_handle(c->ptr, h);
186 
187   // If the region was extended, then there exists a previous chunk that should
188   // be linked to the new chunk.
189   if (maybe_extended_region != nullptr) {
190     ChunkHandle prev =
191         maybe_extended_region->get_handle(maybe_extended_region->ptr());
192     BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
193     // Find the last recorded chunk in the extended region.
194     while (prev_chunk->next != kInvalidChunkHandle) {
195       prev = prev_chunk->next;
196       prev_chunk = ChunkFromHandle(prev);
197     }
198     c->prev = prev;
199     prev_chunk->next = h;
200   }
201 
202   // Maybe merge adjacent chunks and insert the chunk into the right bin.
203   InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
204 
205   return true;
206 }
207 
AllocateChunk()208 BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
209   if (free_chunks_list_ != kInvalidChunkHandle) {
210     ChunkHandle h = free_chunks_list_;
211     Chunk* c = ChunkFromHandle(h);
212     free_chunks_list_ = c->next;
213     return h;
214   } else {
215     ChunkHandle h = chunks_.size();
216     chunks_.resize(h + 1);
217     return h;
218   }
219 }
220 
DeallocateChunk(ChunkHandle h)221 void BFCAllocator::DeallocateChunk(ChunkHandle h) {
222   Chunk* c = ChunkFromHandle(h);
223   c->allocation_id = -1;
224   c->bin_num = kInvalidBinNum;
225   c->next = free_chunks_list_;
226   free_chunks_list_ = h;
227 }
228 
AllocateRawInternalWithRetry(size_t unused_alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)229 void* BFCAllocator::AllocateRawInternalWithRetry(
230     size_t unused_alignment, size_t num_bytes,
231     const AllocationAttributes& allocation_attr) {
232   // Fast path: Try once to allocate without getting the retry_helper_ involved
233   uint64 freed_by_count = 0;
234   if (allocation_attr.freed_by_func != nullptr) {
235     freed_by_count = (*allocation_attr.freed_by_func)();
236   }
237   void* r =
238       AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
239   if (r != nullptr) {
240     return r;
241   } else {
242     static const int64_t kMaxMillisToWait = 10000;  // 10 seconds
243     r = retry_helper_.AllocateRaw(
244         [this, &allocation_attr](size_t a, size_t nb, bool v) {
245           uint64 freed_by_count = 0;
246           if (allocation_attr.freed_by_func != nullptr) {
247             freed_by_count = (*allocation_attr.freed_by_func)();
248           }
249           return AllocateRawInternal(a, nb, v, freed_by_count);
250         },
251         kMaxMillisToWait, unused_alignment, num_bytes);
252     return r;
253   }
254 }
255 
AllocateRaw(size_t unused_alignment,size_t num_bytes,const AllocationAttributes & allocation_attr)256 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
257                                 const AllocationAttributes& allocation_attr) {
258   VLOG(3) << "AllocateRaw " << Name() << "  " << num_bytes;
259   void* result = [&] {
260     if (!allocation_attr.retry_on_failure) {
261       // Return immediately upon the first failure if this is for allocating an
262       // optional scratch space.
263       bool dump_log_on_failure = VLOG_IS_ON(2);
264       uint64 freed_by_count = 0;
265       if (allocation_attr.freed_by_func != nullptr) {
266         freed_by_count = (*allocation_attr.freed_by_func)();
267       }
268       void* res = AllocateRawInternal(unused_alignment, num_bytes,
269                                       dump_log_on_failure, freed_by_count);
270       if (res == nullptr) {
271         static std::atomic<int32> log_counter{0};
272         int32 counter_value = log_counter.load(std::memory_order_relaxed);
273         if (counter_value < 10) {
274           log_counter.store(counter_value + 1, std::memory_order_relaxed);
275           LOG(WARNING)
276               << "Allocator (" << Name() << ") ran out of memory trying "
277               << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
278               << " with freed_by_count=" << freed_by_count
279               << ". The caller indicates that this is not a failure, but"
280               << " may mean that there could be performance gains if more"
281               << " memory were available.";
282         }
283       }
284       return res;
285     } else {
286       return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
287                                           allocation_attr);
288     }
289   }();
290   VLOG(3) << "AllocateRaw " << Name() << "  " << num_bytes << " " << result;
291   return result;
292 }
293 
294 // static
RoundedBytes(size_t bytes)295 size_t BFCAllocator::RoundedBytes(size_t bytes) {
296   size_t rounded_bytes =
297       (kMinAllocationSize *
298        ((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
299   DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
300   return rounded_bytes;
301 }
302 
DeallocateFreeRegions(size_t rounded_bytes)303 bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
304     TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
305   // Do nothing if garbage collection is off.
306   if (!garbage_collection_) {
307     return false;
308   }
309 
310   // Searching for free regions.
311   absl::flat_hash_set<void*> free_region_ptrs;
312   size_t total_free_bytes = 0;
313   for (const AllocationRegion& region : region_manager_.regions()) {
314     ChunkHandle h = region_manager_.get_handle(region.ptr());
315     bool any_use = false;
316     while (h != kInvalidChunkHandle) {
317       const Chunk* c = ChunkFromHandle(h);
318       if (c->in_use()) {
319         any_use = true;
320         break;
321       }
322       h = c->next;
323     }
324 
325     if (!any_use) {
326       VLOG(2) << "Found free region with ptr = " << region.ptr();
327       free_region_ptrs.insert(region.ptr());
328       total_free_bytes += region.memory_size();
329     }
330   }
331 
332   if (total_free_bytes == 0) {
333     return false;
334   }
335 
336   // Rough estimation to check whether deallocation can help.
337   size_t available_bytes =
338       memory_limit_ - total_region_allocated_bytes_ + total_free_bytes;
339   if (rounded_bytes > available_bytes) {
340     return false;
341   }
342 
343   LOG(WARNING) << "Garbage collection: deallocate free memory regions"
344                << " (i.e., allocations) so that we can re-allocate a larger"
345                << " region to avoid OOM due to memory fragmentation. If you"
346                << " see this message frequently, you are running near the"
347                << " threshold of the available device memory and re-allocation"
348                << " may incur great performance overhead. You may try smaller"
349                << " batch sizes to observe the performance impact."
350                << " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
351                << " disable this feature.";
352 
353   // Deallocate free regions.
354   DeallocateRegions(free_region_ptrs);
355 
356   return true;
357 }
358 
DeallocateRegions(const absl::flat_hash_set<void * > & region_ptrs)359 void BFCAllocator::DeallocateRegions(
360     const absl::flat_hash_set<void*>& region_ptrs)
361     TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
362   // Explicitly remove the const qualifier as some compilers disallow passing
363   // const_iterator to std::vector::erase(), which is used in
364   // RemoveAllocationRegion().
365   auto regions =
366       const_cast<std::vector<AllocationRegion>*>(&region_manager_.regions());
367   auto it = regions->begin();
368   while (it != regions->end()) {
369     if (!region_ptrs.contains(it->ptr())) {
370       ++it;
371       continue;
372     }
373 
374     VLOG(2) << "Deallocate region with ptr = " << it->ptr();
375     // Remove all chunk registrations from Bins.
376     ChunkHandle h = region_manager_.get_handle(it->ptr());
377     while (h != kInvalidChunkHandle) {
378       const Chunk* c = ChunkFromHandle(h);
379       if (c->bin_num != kInvalidBinNum) {
380         RemoveFreeChunkFromBin(h);
381       }
382       auto h_to_delete = h;
383       h = c->next;
384       DeleteChunk(h_to_delete);
385     }
386 
387     // Deallocate the memory.
388     sub_allocator_->Free(it->ptr(), it->memory_size());
389     total_region_allocated_bytes_ -= it->memory_size();
390     it = region_manager_.RemoveAllocationRegion(it);
391   }
392 }
393 
AllocateRawInternal(size_t unused_alignment,size_t num_bytes,bool dump_log_on_failure,uint64 freed_before)394 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
395                                         size_t num_bytes,
396                                         bool dump_log_on_failure,
397                                         uint64 freed_before) {
398   if (num_bytes == 0) {
399     VLOG(2) << "tried to allocate 0 bytes";
400     return nullptr;
401   }
402   // First, always allocate memory of at least kMinAllocationSize
403   // bytes, and always allocate multiples of kMinAllocationSize bytes
404   // so all memory addresses are nicely byte aligned.
405   size_t rounded_bytes = RoundedBytes(num_bytes);
406 
407   // The BFC allocator tries to find the best fit first.
408   BinNum bin_num = BinNumForSize(rounded_bytes);
409 
410   mutex_lock l(lock_);
411   if (!timestamped_chunks_.empty()) {
412     // Merge timestamped chunks whose counts have become safe for general use.
413     MergeTimestampedChunks(0);
414   }
415   void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
416   if (ptr != nullptr) {
417     AddTraceMe("MemoryAllocation", ptr);
418     return ptr;
419   }
420 
421   // Try to extend
422   if (Extend(unused_alignment, rounded_bytes)) {
423     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
424     if (ptr != nullptr) {
425       AddTraceMe("MemoryAllocation", ptr);
426       return ptr;
427     }
428   }
429 
430   if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
431     // We're unable to satisfy an allocation request without a specific
432     // timestamp requirement.  Rather than fail, try merging any held-out
433     // timestamped chunks more aggressively until a free chunk of the necessary
434     // size is formed.
435     if (MergeTimestampedChunks(rounded_bytes)) {
436       ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
437       if (ptr != nullptr) {
438         AddTraceMe("MemoryAllocation", ptr);
439         return ptr;
440       }
441     }
442   }
443 
444   // Reaching this point means that no chunks can satisfy the request. Also,
445   // the unallocated bytes cannot satisfy the request. Before giving up, let's
446   // try deallocating free regions so that suballocator can combine them with
447   // the unallocated bytes and form a larger region.
448   if (DeallocateFreeRegions(rounded_bytes) &&
449       Extend(unused_alignment, rounded_bytes)) {
450     ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
451     if (ptr != nullptr) {
452       AddTraceMe("MemoryAllocation", ptr);
453       return ptr;
454     }
455   }
456 
457   // We searched all bins for an existing free chunk to use and
458   // couldn't find one.  This means we must have run out of memory,
459   // Dump the memory log for analysis.
460   MaybeWriteMemoryMap();
461   if (dump_log_on_failure) {
462     LOG(WARNING)
463         << "Allocator (" << Name() << ") ran out of memory trying "
464         << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
465         << " (rounded to " << rounded_bytes << ")"
466         << "requested by op "
467         << ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
468         << "\nIf the cause is memory fragmentation maybe the environment "
469         << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
470         << "improve the situation. \nCurrent allocation summary follows."
471         << "\nCurrent allocation summary follows.";
472     DumpMemoryLog(rounded_bytes);
473     LOG(WARNING) << RenderOccupancy();
474   }
475   return nullptr;
476 }
477 
LargestFreeChunk()478 int64 BFCAllocator::LargestFreeChunk() {
479   for (int i = kNumBins - 1; i >= 0; i--) {
480     if (!BinFromIndex(i)->free_chunks.empty()) {
481       return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
482     }
483   }
484   return 0;
485 }
486 
GetFragmentation()487 double BFCAllocator::GetFragmentation() {
488   int64_t bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use;
489   DCHECK_GT(bytes_available, 0);
490   return static_cast<double>(bytes_available - LargestFreeChunk()) /
491          bytes_available;
492 }
493 
AddTraceMe(absl::string_view traceme_name,const void * ptr)494 void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) {
495   BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr));
496   AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size);
497 }
498 
AddTraceMe(absl::string_view traceme_name,const void * chunk_ptr,int64_t req_bytes,int64_t alloc_bytes)499 void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
500                               const void* chunk_ptr, int64_t req_bytes,
501                               int64_t alloc_bytes) {
502   tensorflow::profiler::TraceMe::InstantActivity(
503       [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
504           TF_NO_THREAD_SAFETY_ANALYSIS {
505             int64_t bytes_available =
506                 memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
507             const auto& annotation =
508                 ScopedMemoryDebugAnnotation::CurrentAnnotation();
509             std::string tensor_shape;
510             if (annotation.pending_shape) {
511               tensor_shape = annotation.pending_shape->DebugString();
512             }
513             return tensorflow::profiler::TraceMeEncode(
514                 traceme_name, {{"allocator_name", name_},
515                                {"bytes_reserved", stats_.bytes_reserved},
516                                {"bytes_allocated", stats_.bytes_in_use},
517                                {"bytes_available", bytes_available},
518                                {"fragmentation", GetFragmentation()},
519                                {"peak_bytes_in_use", stats_.peak_bytes_in_use},
520                                {"requested_bytes", req_bytes},
521                                {"allocation_bytes", alloc_bytes},
522                                {"addr", reinterpret_cast<uint64>(chunk_ptr)},
523                                {"tf_op", annotation.pending_op_name},
524                                {"id", annotation.pending_step_id},
525                                {"region_type", annotation.pending_region_type},
526                                {"data_type", annotation.pending_data_type},
527                                {"shape", tensor_shape}});
528           },
529       /*level=*/profiler::TraceMeLevel::kInfo);
530 }
531 
FindChunkPtr(BinNum bin_num,size_t rounded_bytes,size_t num_bytes,uint64 freed_before)532 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
533                                  size_t num_bytes, uint64 freed_before) {
534   // First identify the first bin that could satisfy rounded_bytes.
535   for (; bin_num < kNumBins; bin_num++) {
536     // Start searching from the first bin for the smallest chunk that fits
537     // rounded_bytes.
538     Bin* b = BinFromIndex(bin_num);
539     for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
540          ++citer) {
541       const BFCAllocator::ChunkHandle h = (*citer);
542       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
543       DCHECK(!chunk->in_use());
544       if (freed_before > 0 && freed_before < chunk->freed_at_count) {
545         continue;
546       }
547       if (chunk->size >= rounded_bytes) {
548         // We found an existing chunk that fits us that wasn't in use, so remove
549         // it from the free bin structure prior to using.
550         RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
551 
552         // If we can break the size of the chunk into two reasonably large
553         // pieces, do so.  In any case don't waste more than a threshold of
554         // kMaxInternalFragmentation bytes on padding this alloc. If this
555         // threshold is not set by the user, then use 128MB as the default
556         // threshold.
557         const int64_t kMaxInternalFragmentation =
558             (internal_fragmentation_fraction_ > 0.0)
559                 ? internal_fragmentation_fraction_ * memory_limit_
560                 : 128 << 20;
561         if (chunk->size >= rounded_bytes * 2 ||
562             static_cast<int64>(chunk->size) - rounded_bytes >=
563                 kMaxInternalFragmentation) {
564           SplitChunk(h, rounded_bytes);
565           chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
566         }
567 
568         // The requested size of the returned chunk is what the user
569         // has allocated.
570         chunk->requested_size = num_bytes;
571         // Assign a unique id and increment the id counter, marking the
572         // chunk as being in use.
573         chunk->allocation_id = next_allocation_id_++;
574 
575         // Update stats.
576         ++stats_.num_allocs;
577         stats_.bytes_in_use += chunk->size;
578         if (stats_.bytes_in_use > stats_.peak_bytes_in_use) {
579           VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use
580                   << " bytes for " << Name();
581         }
582         stats_.peak_bytes_in_use =
583             std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
584         stats_.largest_alloc_size =
585             std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
586 
587 #ifdef TENSORFLOW_MEM_DEBUG
588         if (ShouldRecordOpName()) {
589           const auto& annotation =
590               ScopedMemoryDebugAnnotation::CurrentAnnotation();
591           if (annotation.pending_op_name != nullptr) {
592             chunk->op_name = annotation.pending_op_name;
593           } else {
594             LOG(INFO) << "missing pending_op_name for " << Name()
595                       << " reading addr "
596                       << static_cast<const void*>(&annotation.pending_op_name)
597                       << "\n"
598                       << CurrentStackTrace();
599             chunk->op_name = nullptr;
600           }
601           chunk->action_count = ++action_counter_;
602           chunk->step_id = annotation.pending_step_id;
603           int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
604           size_history_[slot] = stats_.bytes_in_use;
605         }
606 #endif
607 
608         VLOG(4) << "Returning: " << chunk->ptr;
609         if (VLOG_IS_ON(4)) {
610           LOG(INFO) << "A: " << RenderOccupancy();
611         }
612         return chunk->ptr;
613       }
614     }
615   }
616 
617   return nullptr;
618 }
619 
SplitChunk(BFCAllocator::ChunkHandle h,size_t num_bytes)620 void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
621   // Allocate the new chunk before we do any ChunkFromHandle
622   ChunkHandle h_new_chunk = AllocateChunk();
623 
624   Chunk* c = ChunkFromHandle(h);
625   CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
626 
627   // Create a new chunk starting num_bytes after c
628   BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
629   new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
630   region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
631 
632   // Set the new sizes of the chunks.
633   new_chunk->size = c->size - num_bytes;
634   c->size = num_bytes;
635 
636   // The new chunk is not in use.
637   new_chunk->allocation_id = -1;
638 
639   // It inherits the freed time.
640   new_chunk->freed_at_count = c->freed_at_count;
641 
642   // Maintain the pointers.
643   // c <-> c_neighbor becomes
644   // c <-> new_chunk <-> c_neighbor
645   BFCAllocator::ChunkHandle h_neighbor = c->next;
646   new_chunk->prev = h;
647   new_chunk->next = h_neighbor;
648   c->next = h_new_chunk;
649   if (h_neighbor != kInvalidChunkHandle) {
650     Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
651     c_neighbor->prev = h_new_chunk;
652   }
653 
654   // Add the newly free chunk to the free bin.
655   InsertFreeChunkIntoBin(h_new_chunk);
656 }
657 
DeallocateRaw(void * ptr)658 void BFCAllocator::DeallocateRaw(void* ptr) {
659   VLOG(3) << "DeallocateRaw " << Name() << " "
660           << (ptr ? RequestedSize(ptr) : 0);
661   DeallocateRawInternal(ptr);
662   retry_helper_.NotifyDealloc();
663 }
664 
DeallocateRawInternal(void * ptr)665 void BFCAllocator::DeallocateRawInternal(void* ptr) {
666   if (ptr == nullptr) {
667     VLOG(2) << "tried to deallocate nullptr";
668     return;
669   }
670   mutex_lock l(lock_);
671 
672   // Find the chunk from the ptr.
673   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
674   CHECK(h != kInvalidChunkHandle);
675   // Record chunk information before it's freed.
676   Chunk* chunk = ChunkFromHandle(h);
677   void* chunk_ptr = chunk->ptr;
678   int64_t req_bytes = chunk->requested_size;
679   int64_t alloc_bytes = chunk->size;
680 
681   MarkFree(h);
682 
683   // Consider coalescing it.
684   if (timing_counter_) {
685     InsertFreeChunkIntoBin(h);
686     timestamped_chunks_.push_back(h);
687   } else {
688     InsertFreeChunkIntoBin(TryToCoalesce(h, false));
689   }
690 
691   // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
692   // correct aggregation stats (bytes_in_use, fragmentation).
693   AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
694 
695   if (VLOG_IS_ON(4)) {
696     LOG(INFO) << "F: " << RenderOccupancy();
697   }
698 }
699 
700 // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1.
701 // We merge Chunk(h2) into Chunk(h1).
Merge(BFCAllocator::ChunkHandle h1,BFCAllocator::ChunkHandle h2)702 void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
703                          BFCAllocator::ChunkHandle h2) {
704   Chunk* c1 = ChunkFromHandle(h1);
705   Chunk* c2 = ChunkFromHandle(h2);
706   // We can only merge chunks that are not in use.
707   CHECK(!c1->in_use() && !c2->in_use());
708 
709   // c1's prev doesn't change, still points to the same ptr, and is
710   // still not in use.
711 
712   // Fix up neighbor pointers
713   //
714   // c1 <-> c2 <-> c3 should become
715   // c1 <-> c3
716 
717   BFCAllocator::ChunkHandle h3 = c2->next;
718   c1->next = h3;
719   CHECK(c2->prev == h1);
720   if (h3 != kInvalidChunkHandle) {
721     BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
722     c3->prev = h1;
723   }
724 
725   // Set the new size
726   c1->size += c2->size;
727 
728   // Pick latest free time.
729   c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
730 
731   DeleteChunk(h2);
732 }
733 
DeleteChunk(ChunkHandle h)734 void BFCAllocator::DeleteChunk(ChunkHandle h) {
735   // Delete h and cleanup all state
736   Chunk* c = ChunkFromHandle(h);
737   //  VLOG(4) << "Removing: " << c->ptr;
738   region_manager_.erase(c->ptr);
739   DeallocateChunk(h);
740 }
741 
InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h)742 void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
743   Chunk* c = ChunkFromHandle(h);
744   CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
745   BinNum bin_num = BinNumForSize(c->size);
746   Bin* new_bin = BinFromIndex(bin_num);
747   c->bin_num = bin_num;
748   new_bin->free_chunks.insert(h);
749 }
750 
RemoveFreeChunkIterFromBin(BFCAllocator::Bin::FreeChunkSet * free_chunks,const BFCAllocator::Bin::FreeChunkSet::iterator & citer)751 void BFCAllocator::RemoveFreeChunkIterFromBin(
752     BFCAllocator::Bin::FreeChunkSet* free_chunks,
753     const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
754   ChunkHandle h = *citer;
755   Chunk* c = ChunkFromHandle(h);
756   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
757   free_chunks->erase(citer);
758   c->bin_num = kInvalidBinNum;
759 }
760 
RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h)761 void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
762   Chunk* c = ChunkFromHandle(h);
763   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
764   CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
765       << "Could not find chunk in bin";
766   c->bin_num = kInvalidBinNum;
767 }
768 
MarkFree(BFCAllocator::ChunkHandle h)769 void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
770   Chunk* c = ChunkFromHandle(h);
771   CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
772 
773   // Mark the chunk as no longer in use.
774   c->allocation_id = -1;
775 
776   // Optionally record the free time.
777   if (timing_counter_) {
778     c->freed_at_count = timing_counter_->next();
779   }
780 
781   // Updates the stats.
782   stats_.bytes_in_use -= c->size;
783 
784 #ifdef TENSORFLOW_MEM_DEBUG
785   if (ShouldRecordOpName()) {
786     c->action_count = ++action_counter_;
787     int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
788     size_history_[slot] = stats_.bytes_in_use;
789   }
790 #endif
791 }
792 
TryToCoalesce(ChunkHandle h,bool ignore_freed_at)793 BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
794                                                       bool ignore_freed_at) {
795   Chunk* c = ChunkFromHandle(h);
796   if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
797   ChunkHandle coalesced_chunk = h;
798 
799   // If the next chunk is free, merge it into c and delete it.
800   if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
801     Chunk* n = ChunkFromHandle(c->next);
802     if ((n->freed_at_count == 0) || ignore_freed_at) {
803       VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
804       RemoveFreeChunkFromBin(c->next);
805       Merge(h, c->next);
806     }
807   }
808 
809   // If the previous chunk is free, merge c into it and delete c.
810   if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
811     Chunk* n = ChunkFromHandle(c->prev);
812     if ((n->freed_at_count == 0) || ignore_freed_at) {
813       VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
814       coalesced_chunk = c->prev;
815       RemoveFreeChunkFromBin(c->prev);
816       Merge(c->prev, h);
817     }
818   }
819 
820   return coalesced_chunk;
821 }
822 
SetSafeFrontier(uint64 count)823 void BFCAllocator::SetSafeFrontier(uint64 count) {
824   uint64 current = safe_frontier_.load(std::memory_order_relaxed);
825   while (count > current) {
826     if (safe_frontier_.compare_exchange_strong(current, count)) {
827       retry_helper_.NotifyDealloc();
828       return;
829     } else {
830       current = safe_frontier_.load(std::memory_order_relaxed);
831     }
832   }
833 }
834 
MergeTimestampedChunks(size_t required_bytes)835 bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
836   VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
837           << " required_bytes=" << required_bytes;
838   bool satisfied = (required_bytes == 0);
839   std::vector<void*> to_merge;
840   std::deque<ChunkHandle> new_ts_queue;
841   while (!timestamped_chunks_.empty()) {
842     ChunkHandle h = timestamped_chunks_.front();
843     timestamped_chunks_.pop_front();
844     DCHECK_NE(h, kInvalidChunkHandle);
845     Chunk* c = ChunkFromHandle(h);
846     // It's possible this chunk has already been merged so refetch and retest
847     // the handle.
848     h = region_manager_.get_handle(c->ptr);
849     if (h == kInvalidChunkHandle) {
850       continue;
851     }
852     if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
853       // This chunk has already been reallocated.
854       continue;
855     }
856     if (c->freed_at_count == 0) {
857       to_merge.push_back(c->ptr);
858       continue;
859     }
860     // Chunk should be free and assigned to a bin.
861     DCHECK_NE(c->bin_num, kInvalidBinNum);
862     if (c->freed_at_count < safe_frontier_) {
863       c->freed_at_count = 0;
864       to_merge.push_back(c->ptr);
865     } else if (required_bytes > 0) {
866       to_merge.push_back(c->ptr);
867     } else {
868       new_ts_queue.push_back(h);
869     }
870   }
871   DCHECK(timestamped_chunks_.empty());
872   std::swap(timestamped_chunks_, new_ts_queue);
873 
874   // At this point all candidate chunks have been moved from timestamped_chunks_
875   // to to_merge.  If this is a standard merge (required_bytes == 0) then
876   // merge them all, otherwise merge just until a Chunk of the required size
877   // is produced.
878   for (int ci = 0, end = to_merge.size(); ci < end; ++ci) {
879     void* ptr = to_merge[ci];
880     // It's possible that the Chunk associated with this memory location got
881     // merged and deallocated in a prior iteration so refetch the handle and
882     // retest.
883     ChunkHandle h = region_manager_.get_handle(ptr);
884     if (h == kInvalidChunkHandle) continue;
885     if (required_bytes == 0 || !satisfied) {
886       Chunk* c = ChunkFromHandle(h);
887       DCHECK_NE(c->bin_num, kInvalidBinNum);
888       DCHECK(!c->in_use());
889       RemoveFreeChunkFromBin(h);
890       ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
891       InsertFreeChunkIntoBin(new_h);
892       if (required_bytes > 0) {
893         c = ChunkFromHandle(new_h);
894         if (new_h != h && c->freed_at_count > 0) {
895           timestamped_chunks_.push_back(new_h);
896         }
897         if (c->size >= required_bytes) {
898           satisfied = true;
899         }
900       }
901     } else {
902       // We were force merging Chunks with unsafe timestamps, but managed
903       // to create a satisfying Chunk so just requeue the rest.
904       timestamped_chunks_.push_back(h);
905     }
906   }
907   return satisfied;
908 }
909 
TracksAllocationSizes() const910 bool BFCAllocator::TracksAllocationSizes() const { return true; }
911 
RequestedSize(const void * ptr) const912 size_t BFCAllocator::RequestedSize(const void* ptr) const {
913   CHECK(ptr);
914   mutex_lock l(lock_);
915   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
916   CHECK(h != kInvalidChunkHandle)
917       << "Asked for requested size of pointer we never allocated: " << ptr;
918   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
919   return c->requested_size;
920 }
921 
AllocatedSize(const void * ptr) const922 size_t BFCAllocator::AllocatedSize(const void* ptr) const {
923   mutex_lock l(lock_);
924   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
925   CHECK(h != kInvalidChunkHandle)
926       << "Asked for allocated size of pointer we never allocated: " << ptr;
927   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
928   return c->size;
929 }
930 
AllocationId(const void * ptr) const931 int64 BFCAllocator::AllocationId(const void* ptr) const {
932   mutex_lock l(lock_);
933   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
934   CHECK(h != kInvalidChunkHandle)
935       << "Asked for allocation id of pointer we never allocated: " << ptr;
936   const BFCAllocator::Chunk* c = ChunkFromHandle(h);
937   return c->allocation_id;
938 }
939 
940 namespace {
941 
RenderRegion(char * rendered,const size_t resolution,const size_t total_render_size,const size_t offset,const void * base_ptr,const void * ptr,const size_t size,const char c)942 void RenderRegion(char* rendered, const size_t resolution,
943                   const size_t total_render_size, const size_t offset,
944                   const void* base_ptr, const void* ptr, const size_t size,
945                   const char c) {
946   const char* base_ptr_c = static_cast<const char*>(base_ptr);
947   const char* ptr_c = static_cast<const char*>(ptr);
948 
949   size_t start_location =
950       ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
951   CHECK_GE(start_location, 0);
952   CHECK_LT(start_location, resolution);
953   size_t end_location =
954       ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
955       total_render_size;
956   CHECK_GE(end_location, 0);
957   CHECK_LT(end_location, resolution);
958 
959   for (size_t i = start_location; i <= end_location; ++i) {
960     rendered[i] = c;
961   }
962 }
963 
964 }  // namespace
965 
RenderOccupancy()966 string BFCAllocator::RenderOccupancy() {
967   // Make a buffer for the ASCII-art representation.
968   const size_t resolution = 100;
969   char rendered[resolution];
970 
971   // Compute the total region size to render over
972   size_t total_region_size = 0;
973   for (const auto& region : region_manager_.regions()) {
974     total_region_size += region.memory_size();
975   }
976 
977   if (total_region_size == 0) {
978     return "<allocator contains no memory>";
979   }
980 
981   // Start out with everything empty
982   RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
983                total_region_size, '_');
984 
985   size_t region_offset = 0;
986   for (const auto& region : region_manager_.regions()) {
987     ChunkHandle h = region_manager_.get_handle(region.ptr());
988     // Then render each chunk left to right.
989     while (h != kInvalidChunkHandle) {
990       Chunk* c = ChunkFromHandle(h);
991       if (c->in_use()) {
992         // Render the wasted space
993         size_t wasted = c->size - c->requested_size;
994         if (wasted > 0) {
995           RenderRegion(rendered, resolution, total_region_size,
996                        region_offset + c->requested_size, region.ptr(), c->ptr,
997                        wasted, 'x');
998         }
999         // Then the occupied space
1000         RenderRegion(rendered, resolution, total_region_size, region_offset,
1001                      region.ptr(), c->ptr, c->requested_size, '*');
1002       }
1003       h = c->next;
1004     }
1005     region_offset += region.memory_size();
1006   }
1007 
1008   return string(rendered, resolution);
1009 }
1010 
DumpMemoryLog(size_t num_bytes)1011 void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
1012   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1013   LOG(INFO) << "BFCAllocator dump for " << Name();
1014   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1015     Bin* b = BinFromIndex(bin_num);
1016     const BinDebugInfo& bin_info = bin_infos[bin_num];
1017     CHECK_EQ(b->free_chunks.size(),
1018              bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1019 
1020     LOG(INFO) << "Bin (" << b->bin_size
1021               << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin
1022               << ", Chunks in use: " << bin_info.total_chunks_in_use << ". "
1023               << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin)
1024               << " allocated for chunks. "
1025               << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use)
1026               << " in use in bin. "
1027               << strings::HumanReadableNumBytes(
1028                      bin_info.total_requested_bytes_in_use)
1029               << " client-requested in use in bin.";
1030   }
1031 
1032   // Find the bin that we would have liked to allocate in, so we
1033   // can get some further analysis about fragmentation.
1034   Bin* b = BinForSize(num_bytes);
1035 
1036   LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
1037             << " was " << strings::HumanReadableNumBytes(b->bin_size)
1038             << ", Chunk State: ";
1039 
1040   for (ChunkHandle h : b->free_chunks) {
1041     Chunk* c = ChunkFromHandle(h);
1042     LOG(INFO) << c->DebugString(this, true);
1043   }
1044 
1045   // Next show the chunks that are in use, and also summarize their
1046   // number by size.
1047   std::map<size_t, int> in_use_by_size;
1048   for (const auto& region : region_manager_.regions()) {
1049     LOG(INFO) << "Next region of size " << region.memory_size();
1050     ChunkHandle h = region_manager_.get_handle(region.ptr());
1051     while (h != kInvalidChunkHandle) {
1052       const Chunk* c = ChunkFromHandle(h);
1053       if (c->in_use()) {
1054         in_use_by_size[c->size]++;
1055       }
1056       string buf = strings::StrCat(
1057           (c->in_use() ? "InUse" : "Free "), " at ",
1058           strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
1059 #ifdef TENSORFLOW_MEM_DEBUG
1060       if (ShouldRecordOpName()) {
1061         strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
1062                            c->action_count, " step ", c->step_id);
1063       }
1064 #endif
1065       strings::StrAppend(&buf, " next ", c->next);
1066       if (timing_counter_) {
1067         strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
1068       }
1069       LOG(INFO) << buf;
1070       h = c->next;
1071     }
1072   }
1073 
1074   LOG(INFO) << "     Summary of in-use Chunks by size: ";
1075   size_t total_bytes = 0;
1076   for (auto& it : in_use_by_size) {
1077     LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
1078               << strings::HumanReadableNumBytes(it.first * it.second);
1079     total_bytes += (it.first * it.second);
1080   }
1081   LOG(INFO) << "Sum Total of in-use chunks: "
1082             << strings::HumanReadableNumBytes(total_bytes);
1083   LOG(INFO) << "total_region_allocated_bytes_: "
1084             << total_region_allocated_bytes_
1085             << " memory_limit_: " << memory_limit_ << " available bytes: "
1086             << (memory_limit_ - total_region_allocated_bytes_)
1087             << " curr_region_allocation_bytes_: "
1088             << curr_region_allocation_bytes_;
1089   LOG(INFO) << "Stats: \n" << stats_.DebugString();
1090 }
1091 
MaybeWriteMemoryMap()1092 void BFCAllocator::MaybeWriteMemoryMap() {
1093   const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
1094   if (gpu_memory_map_file != nullptr) {
1095     std::unique_ptr<WritableFile> dump_file;
1096     string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
1097                                        Env::Default()->NowMicros());
1098     Status status = Env::Default()->NewWritableFile(file_name, &dump_file);
1099     if (!status.ok()) {
1100       LOG(ERROR) << "Failed to open file " << file_name;
1101       return;
1102     }
1103     MemoryDump md = RecordMemoryMapInternal();
1104     status = dump_file->Append(md.SerializeAsString());
1105     if (!status.ok()) {
1106       LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": "
1107                  << status;
1108     }
1109   }
1110 }
1111 
RecordMemoryMap()1112 MemoryDump BFCAllocator::RecordMemoryMap() {
1113   mutex_lock l(lock_);
1114   return RecordMemoryMapInternal();
1115 }
1116 
RecordMemoryMapInternal()1117 MemoryDump BFCAllocator::RecordMemoryMapInternal() {
1118   MemoryDump md;
1119   md.set_allocator_name(Name());
1120 
1121   // Record the general stats
1122   MemAllocatorStats* mas = md.mutable_stats();
1123   mas->set_num_allocs(stats_.num_allocs);
1124   mas->set_bytes_in_use(stats_.bytes_in_use);
1125   mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
1126   mas->set_largest_alloc_size(stats_.largest_alloc_size);
1127 
1128   // Record summary data for every bin.
1129   const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1130   for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1131     Bin* b = BinFromIndex(bin_num);
1132     const BinDebugInfo& bin_info = bin_infos[bin_num];
1133     DCHECK_EQ(b->free_chunks.size(),
1134               bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1135     BinSummary* bs = md.add_bin_summary();
1136     bs->set_bin(bin_num);
1137     bs->set_total_bytes_in_use(bin_info.total_bytes_in_use);
1138     bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin);
1139     bs->set_total_chunks_in_use(bin_info.total_chunks_in_use);
1140     bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin);
1141   }
1142 
1143   // Record state of every defined Chunk.
1144   for (const auto& region : region_manager_.regions()) {
1145     ChunkHandle h = region_manager_.get_handle(region.ptr());
1146     while (h != kInvalidChunkHandle) {
1147       const Chunk* c = ChunkFromHandle(h);
1148       MemChunk* mc = md.add_chunk();
1149       mc->set_in_use(c->in_use());
1150       mc->set_address(reinterpret_cast<uint64>(c->ptr));
1151       mc->set_size(c->size);
1152       mc->set_requested_size(c->requested_size);
1153       mc->set_bin(c->bin_num);
1154 #ifdef TENSORFLOW_MEM_DEBUG
1155       mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
1156       mc->set_step_id(c->step_id);
1157       mc->set_action_count(c->action_count);
1158 #endif
1159       if (timing_counter_) {
1160         mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
1161       }
1162       h = c->next;
1163     }
1164   }
1165 
1166   mas->set_fragmentation_metric(GetFragmentation());
1167 
1168 #ifdef TENSORFLOW_MEM_DEBUG
1169   // Record the recent size history
1170   int history_len = static_cast<int>(std::min(
1171       action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
1172   for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
1173     SnapShot* ss = md.add_snap_shot();
1174     ss->set_action_count(i);
1175     int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
1176     ss->set_size(size_history_[slot]);
1177   }
1178 #endif
1179 
1180   return md;
1181 }
1182 
GetStats()1183 absl::optional<AllocatorStats> BFCAllocator::GetStats() {
1184   mutex_lock l(lock_);
1185   return stats_;
1186 }
1187 
ClearStats()1188 bool BFCAllocator::ClearStats() {
1189   mutex_lock l(lock_);
1190   stats_.num_allocs = 0;
1191   stats_.peak_bytes_in_use = stats_.bytes_in_use;
1192   stats_.largest_alloc_size = 0;
1193   return true;
1194 }
1195 
1196 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
get_bin_debug_info()1197 BFCAllocator::get_bin_debug_info() {
1198   std::array<BinDebugInfo, kNumBins> bin_infos;
1199   for (const auto& region : region_manager_.regions()) {
1200     ChunkHandle h = region_manager_.get_handle(region.ptr());
1201     while (h != kInvalidChunkHandle) {
1202       const Chunk* c = ChunkFromHandle(h);
1203       BinNum bin_num = BinNumForSize(c->size);
1204       BinDebugInfo& bin_info = bin_infos[bin_num];
1205       bin_info.total_bytes_in_bin += c->size;
1206       bin_info.total_chunks_in_bin++;
1207       if (c->in_use()) {
1208         bin_info.total_bytes_in_use += c->size;
1209         bin_info.total_requested_bytes_in_use += c->requested_size;
1210         bin_info.total_chunks_in_use++;
1211       } else {
1212         Bin* bin = BinFromIndex(bin_num);
1213         CHECK_EQ(bin->free_chunks.count(h), 1);
1214         CHECK_EQ(c->bin_num, bin_num);
1215       }
1216       h = c->next;
1217     }
1218   }
1219   return bin_infos;
1220 }
1221 
1222 }  // namespace tensorflow
1223