1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/common_runtime/bfc_allocator.h"
17
18 #include <atomic>
19
20 #include "absl/strings/string_view.h"
21 #include "tensorflow/core/common_runtime/allocator_retry.h"
22 #include "tensorflow/core/lib/core/bits.h"
23 #include "tensorflow/core/lib/strings/numbers.h"
24 #include "tensorflow/core/lib/strings/str_util.h"
25 #include "tensorflow/core/lib/strings/strcat.h"
26 #include "tensorflow/core/platform/file_system.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/mutex.h"
29 #ifdef TENSORFLOW_MEM_DEBUG
30 #include "tensorflow/core/platform/stacktrace.h"
31 #endif
32 #include "tensorflow/core/framework/tensor_shape.h"
33 #include "tensorflow/core/platform/types.h"
34 #include "tensorflow/core/profiler/lib/traceme.h"
35 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
36
37 namespace tensorflow {
38
39 constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;
40
41 BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
42 bool allow_growth, const string& name,
43 bool garbage_collection)
44 : garbage_collection_(garbage_collection),
45 coalesce_regions_(sub_allocator->SupportsCoalescing()),
46 sub_allocator_(sub_allocator),
47 name_(name),
48 free_chunks_list_(kInvalidChunkHandle),
49 next_allocation_id_(1) {
50 if (allow_growth) {
51 // 2MiB smallest initial allocation, unless total memory available
52 // is less.
53 curr_region_allocation_bytes_ =
54 RoundedBytes(std::min(total_memory, size_t{2 << 20}));
55 } else {
56 curr_region_allocation_bytes_ = RoundedBytes(total_memory);
57 }
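// With allow_growth the arena starts small and Extend() doubles the region
// target as needed; otherwise the first Extend() attempts to reserve the whole
// memory limit as a single region.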
58
59 // Allocate the requested amount of memory.
60 memory_limit_ = total_memory;
61 stats_.bytes_limit = static_cast<int64>(total_memory);
62
63 // Create a bunch of bins of various good sizes.
64
65 // We create bins to cover the full range of allocation sizes, from
66 // allocations of up to 256 bytes through allocations of up to (and
67 // including) the memory limit.
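// Assuming BinNumToSize(b) follows the 256 << b progression defined in the
// header, bin b holds free chunks whose sizes fall in [256 << b, 256 << (b+1)),
// with the last bin absorbing everything larger; the CHECKs below verify those
// boundaries.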
68 VLOG(1) << "Creating new BFCAllocator named: " << name;
69 for (BinNum b = 0; b < kNumBins; b++) {
70 size_t bin_size = BinNumToSize(b);
71 VLOG(1) << "Creating bin of max chunk size "
72 << strings::HumanReadableNumBytes(bin_size);
73 new (BinFromIndex(b)) Bin(this, bin_size);
74 CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
75 CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
76 CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
77 if (b + 1 < kNumBins) {
78 CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
79 }
80 }
81 }
82
83 BFCAllocator::~BFCAllocator() {
84 // Return all regions to the sub-allocator.
85 VLOG(2) << "Number of regions allocated: "
86 << region_manager_.regions().size();
87 for (const auto& region : region_manager_.regions()) {
88 sub_allocator_->Free(region.ptr(), region.memory_size());
89 }
90
91 for (BinNum b = 0; b < kNumBins; b++) {
92 BinFromIndex(b)->~Bin();
93 }
94 }
95
96 BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
97 DCHECK_GE(h, 0);
98 DCHECK_LT(h, static_cast<int>(chunks_.size()));
99 return &(chunks_[h]);
100 }
101
102 const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const {
103 DCHECK_GE(h, 0);
104 DCHECK_LT(h, static_cast<int>(chunks_.size()));
105 return &(chunks_[h]);
106 }
107
108 bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
109 size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
110 // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
111 available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
112
113 // Do we have enough space to handle the client's request?
114 // If not, fail immediately.
115 if (rounded_bytes > available_bytes) {
116 return false;
117 }
118
119 // If curr_region_allocation_bytes_ is not enough to satisfy the
120 // allocation, keep multiplying by a power of two until that is
121 // sufficient.
122 bool increased_allocation = false;
123 while (rounded_bytes > curr_region_allocation_bytes_) {
124 curr_region_allocation_bytes_ *= 2;
125 increased_allocation = true;
126 }
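// For example, with a 2MiB region target and an 11MiB rounded request, the
// target doubles 2 -> 4 -> 8 -> 16MiB before the allocation attempt below.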
127
128 // Try allocating.
129 size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
130 size_t bytes_received;
131 void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
132 if (mem_addr == nullptr && !started_backpedal_) {
133 // Only backpedal once.
134 started_backpedal_ = true;
135
136 static constexpr float kBackpedalFactor = 0.9;
137
138 // Try allocating less memory.
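// Shrink geometrically: each attempt requests ~90% of the previous amount
// (rounded to a kMinAllocationSize multiple) and stops once that would fall
// below rounded_bytes.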
139 while (mem_addr == nullptr) {
140 bytes = RoundedBytes(bytes * kBackpedalFactor);
141 if (bytes < rounded_bytes) break;
142 mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
143 }
144 }
145
146 if (mem_addr == nullptr) {
147 return false;
148 }
149
150 if (!increased_allocation) {
151 // Increase the region size of the next required allocation.
152 curr_region_allocation_bytes_ *= 2;
153 }
154
155 VLOG(1) << "Extending allocation by "
156 << strings::HumanReadableNumBytes(bytes_received) << " bytes for "
157 << Name() << ".";
158
159 total_region_allocated_bytes_ += bytes_received;
160 VLOG(1) << "Total allocated bytes: "
161 << strings::HumanReadableNumBytes(total_region_allocated_bytes_);
162
163 VLOG(1) << "Allocated memory at " << mem_addr << " to "
164 << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
165
166 AllocationRegion* maybe_extended_region = nullptr;
167 if (coalesce_regions_) {
168 maybe_extended_region =
169 region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
170 } else {
171 region_manager_.AddAllocationRegion(mem_addr, bytes_received);
172 }
173
174 // Create one large chunk for the whole memory space that will
175 // be chunked later.
176 ChunkHandle h = AllocateChunk();
177 BFCAllocator::Chunk* c = ChunkFromHandle(h);
178 c->ptr = mem_addr;
179 c->size = bytes_received;
180 c->allocation_id = -1;
181 c->prev = kInvalidChunkHandle;
182 c->next = kInvalidChunkHandle;
183 c->freed_at_count = 0;
184
185 region_manager_.set_handle(c->ptr, h);
186
187 // If the region was extended, then there exists a previous chunk that should
188 // be linked to the new chunk.
189 if (maybe_extended_region != nullptr) {
190 ChunkHandle prev =
191 maybe_extended_region->get_handle(maybe_extended_region->ptr());
192 BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
193 // Find the last recorded chunk in the extended region.
194 while (prev_chunk->next != kInvalidChunkHandle) {
195 prev = prev_chunk->next;
196 prev_chunk = ChunkFromHandle(prev);
197 }
198 c->prev = prev;
199 prev_chunk->next = h;
200 }
201
202 // Maybe merge adjacent chunks and insert the chunk into the right bin.
203 InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
204
205 return true;
206 }
207
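// Chunk handles are indices into the chunks_ vector. Freed Chunk records are
// kept on a singly linked free list (free_chunks_list_, threaded through the
// next field) so they can be recycled before the vector has to grow.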
208 BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
209 if (free_chunks_list_ != kInvalidChunkHandle) {
210 ChunkHandle h = free_chunks_list_;
211 Chunk* c = ChunkFromHandle(h);
212 free_chunks_list_ = c->next;
213 return h;
214 } else {
215 ChunkHandle h = chunks_.size();
216 chunks_.resize(h + 1);
217 return h;
218 }
219 }
220
221 void BFCAllocator::DeallocateChunk(ChunkHandle h) {
222 Chunk* c = ChunkFromHandle(h);
223 c->allocation_id = -1;
224 c->bin_num = kInvalidBinNum;
225 c->next = free_chunks_list_;
226 free_chunks_list_ = h;
227 }
228
229 void* BFCAllocator::AllocateRawInternalWithRetry(
230 size_t unused_alignment, size_t num_bytes,
231 const AllocationAttributes& allocation_attr) {
232 // Fast path: try once to allocate without involving the retry_helper_.
233 uint64 freed_by_count = 0;
234 if (allocation_attr.freed_by_func != nullptr) {
235 freed_by_count = (*allocation_attr.freed_by_func)();
236 }
237 void* r =
238 AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
239 if (r != nullptr) {
240 return r;
241 } else {
242 static const int64_t kMaxMillisToWait = 10000; // 10 seconds
243 r = retry_helper_.AllocateRaw(
244 [this, &allocation_attr](size_t a, size_t nb, bool v) {
245 uint64 freed_by_count = 0;
246 if (allocation_attr.freed_by_func != nullptr) {
247 freed_by_count = (*allocation_attr.freed_by_func)();
248 }
249 return AllocateRawInternal(a, nb, v, freed_by_count);
250 },
251 kMaxMillisToWait, unused_alignment, num_bytes);
252 return r;
253 }
254 }
255
256 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
257 const AllocationAttributes& allocation_attr) {
258 VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes;
259 void* result = [&] {
260 if (!allocation_attr.retry_on_failure) {
261 // Return immediately upon the first failure if this is for allocating an
262 // optional scratch space.
263 bool dump_log_on_failure = VLOG_IS_ON(2);
264 uint64 freed_by_count = 0;
265 if (allocation_attr.freed_by_func != nullptr) {
266 freed_by_count = (*allocation_attr.freed_by_func)();
267 }
268 void* res = AllocateRawInternal(unused_alignment, num_bytes,
269 dump_log_on_failure, freed_by_count);
270 if (res == nullptr) {
271 static std::atomic<int32> log_counter{0};
272 int32 counter_value = log_counter.load(std::memory_order_relaxed);
273 if (counter_value < 10) {
274 log_counter.store(counter_value + 1, std::memory_order_relaxed);
275 LOG(WARNING)
276 << "Allocator (" << Name() << ") ran out of memory trying "
277 << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
278 << " with freed_by_count=" << freed_by_count
279 << ". The caller indicates that this is not a failure, but"
280 << " may mean that there could be performance gains if more"
281 << " memory were available.";
282 }
283 }
284 return res;
285 } else {
286 return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
287 allocation_attr);
288 }
289 }();
290 VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes << " " << result;
291 return result;
292 }
293
294 // static
295 size_t BFCAllocator::RoundedBytes(size_t bytes) {
296 size_t rounded_bytes =
297 (kMinAllocationSize *
298 ((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
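// For example, assuming kMinAllocationSize is 256 (as defined in the header),
// RoundedBytes(300) == 512 and RoundedBytes(256) == 256.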
299 DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
300 return rounded_bytes;
301 }
302
303 bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
304 TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
305 // Do nothing if garbage collection is off.
306 if (!garbage_collection_) {
307 return false;
308 }
309
310 // Search for free regions.
311 absl::flat_hash_set<void*> free_region_ptrs;
312 size_t total_free_bytes = 0;
313 for (const AllocationRegion& region : region_manager_.regions()) {
314 ChunkHandle h = region_manager_.get_handle(region.ptr());
315 bool any_use = false;
316 while (h != kInvalidChunkHandle) {
317 const Chunk* c = ChunkFromHandle(h);
318 if (c->in_use()) {
319 any_use = true;
320 break;
321 }
322 h = c->next;
323 }
324
325 if (!any_use) {
326 VLOG(2) << "Found free region with ptr = " << region.ptr();
327 free_region_ptrs.insert(region.ptr());
328 total_free_bytes += region.memory_size();
329 }
330 }
331
332 if (total_free_bytes == 0) {
333 return false;
334 }
335
336 // Rough estimation to check whether deallocation can help.
337 size_t available_bytes =
338 memory_limit_ - total_region_allocated_bytes_ + total_free_bytes;
339 if (rounded_bytes > available_bytes) {
340 return false;
341 }
342
343 LOG(WARNING) << "Garbage collection: deallocate free memory regions"
344 << " (i.e., allocations) so that we can re-allocate a larger"
345 << " region to avoid OOM due to memory fragmentation. If you"
346 << " see this message frequently, you are running near the"
347 << " threshold of the available device memory and re-allocation"
348 << " may incur great performance overhead. You may try smaller"
349 << " batch sizes to observe the performance impact."
350 << " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
351 << " disable this feature.";
352
353 // Deallocate free regions.
354 DeallocateRegions(free_region_ptrs);
355
356 return true;
357 }
358
359 void BFCAllocator::DeallocateRegions(
360 const absl::flat_hash_set<void*>& region_ptrs)
361 TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
362 // Explicitly remove the const qualifier as some compilers disallow passing
363 // const_iterator to std::vector::erase(), which is used in
364 // RemoveAllocationRegion().
365 auto regions =
366 const_cast<std::vector<AllocationRegion>*>(&region_manager_.regions());
367 auto it = regions->begin();
368 while (it != regions->end()) {
369 if (!region_ptrs.contains(it->ptr())) {
370 ++it;
371 continue;
372 }
373
374 VLOG(2) << "Deallocate region with ptr = " << it->ptr();
375 // Remove all chunk registrations from Bins.
376 ChunkHandle h = region_manager_.get_handle(it->ptr());
377 while (h != kInvalidChunkHandle) {
378 const Chunk* c = ChunkFromHandle(h);
379 if (c->bin_num != kInvalidBinNum) {
380 RemoveFreeChunkFromBin(h);
381 }
382 auto h_to_delete = h;
383 h = c->next;
384 DeleteChunk(h_to_delete);
385 }
386
387 // Deallocate the memory.
388 sub_allocator_->Free(it->ptr(), it->memory_size());
389 total_region_allocated_bytes_ -= it->memory_size();
390 it = region_manager_.RemoveAllocationRegion(it);
391 }
392 }
393
394 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
395 size_t num_bytes,
396 bool dump_log_on_failure,
397 uint64 freed_before) {
398 if (num_bytes == 0) {
399 VLOG(2) << "tried to allocate 0 bytes";
400 return nullptr;
401 }
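// The allocation strategy escalates in stages: look for a free chunk in the
// bins, extend the arena with a new region, force-merge timestamped chunks,
// and finally garbage-collect fully free regions before giving up.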
402 // First, always allocate memory of at least kMinAllocationSize
403 // bytes, and always allocate multiples of kMinAllocationSize bytes
404 // so all memory addresses are nicely byte aligned.
405 size_t rounded_bytes = RoundedBytes(num_bytes);
406
407 // The BFC allocator tries to find the best fit first.
408 BinNum bin_num = BinNumForSize(rounded_bytes);
409
410 mutex_lock l(lock_);
411 if (!timestamped_chunks_.empty()) {
412 // Merge timestamped chunks whose counts have become safe for general use.
413 MergeTimestampedChunks(0);
414 }
415 void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
416 if (ptr != nullptr) {
417 AddTraceMe("MemoryAllocation", ptr);
418 return ptr;
419 }
420
421 // Try to extend
422 if (Extend(unused_alignment, rounded_bytes)) {
423 ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
424 if (ptr != nullptr) {
425 AddTraceMe("MemoryAllocation", ptr);
426 return ptr;
427 }
428 }
429
430 if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
431 // We're unable to satisfy an allocation request without a specific
432 // timestamp requirement. Rather than fail, try merging any held-out
433 // timestamped chunks more aggressively until a free chunk of the necessary
434 // size is formed.
435 if (MergeTimestampedChunks(rounded_bytes)) {
436 ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
437 if (ptr != nullptr) {
438 AddTraceMe("MemoryAllocation", ptr);
439 return ptr;
440 }
441 }
442 }
443
444 // Reaching this point means that no chunks can satisfy the request. Also,
445 // the unallocated bytes cannot satisfy the request. Before giving up, let's
446 // try deallocating free regions so that the sub-allocator can combine them with
447 // the unallocated bytes and form a larger region.
448 if (DeallocateFreeRegions(rounded_bytes) &&
449 Extend(unused_alignment, rounded_bytes)) {
450 ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
451 if (ptr != nullptr) {
452 AddTraceMe("MemoryAllocation", ptr);
453 return ptr;
454 }
455 }
456
457 // We searched all bins for an existing free chunk to use and
458 // couldn't find one. This means we must have run out of memory.
459 // Dump the memory log for analysis.
460 MaybeWriteMemoryMap();
461 if (dump_log_on_failure) {
462 LOG(WARNING)
463 << "Allocator (" << Name() << ") ran out of memory trying "
464 << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
465 << " (rounded to " << rounded_bytes << ")"
466 << "requested by op "
467 << ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
468 << "\nIf the cause is memory fragmentation maybe the environment "
469 << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
470 << "improve the situation. \nCurrent allocation summary follows."
471 << "\nCurrent allocation summary follows.";
472 DumpMemoryLog(rounded_bytes);
473 LOG(WARNING) << RenderOccupancy();
474 }
475 return nullptr;
476 }
477
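// Scans the bins from largest to smallest and returns the size of the biggest
// free chunk, or 0 if nothing is free. This assumes the per-bin free-chunk set
// is ordered by increasing chunk size (per the comparator in the header), so
// *rbegin() is the largest entry in a bin.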
478 int64 BFCAllocator::LargestFreeChunk() {
479 for (int i = kNumBins - 1; i >= 0; i--) {
480 if (!BinFromIndex(i)->free_chunks.empty()) {
481 return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
482 }
483 }
484 return 0;
485 }
486
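// Fragmentation metric: the fraction of free bytes (within allocated regions)
// that are not part of the single largest free chunk. 0 means all free memory
// is contiguous; values near 1 mean it is scattered across many small chunks.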
487 double BFCAllocator::GetFragmentation() {
488 int64_t bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use;
489 DCHECK_GT(bytes_available, 0);
490 return static_cast<double>(bytes_available - LargestFreeChunk()) /
491 bytes_available;
492 }
493
494 void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) {
495 BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr));
496 AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size);
497 }
498
499 void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
500 const void* chunk_ptr, int64_t req_bytes,
501 int64_t alloc_bytes) {
502 tensorflow::profiler::TraceMe::InstantActivity(
503 [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
504 TF_NO_THREAD_SAFETY_ANALYSIS {
505 int64_t bytes_available =
506 memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
507 const auto& annotation =
508 ScopedMemoryDebugAnnotation::CurrentAnnotation();
509 std::string tensor_shape;
510 if (annotation.pending_shape) {
511 tensor_shape = annotation.pending_shape->DebugString();
512 }
513 return tensorflow::profiler::TraceMeEncode(
514 traceme_name, {{"allocator_name", name_},
515 {"bytes_reserved", stats_.bytes_reserved},
516 {"bytes_allocated", stats_.bytes_in_use},
517 {"bytes_available", bytes_available},
518 {"fragmentation", GetFragmentation()},
519 {"peak_bytes_in_use", stats_.peak_bytes_in_use},
520 {"requested_bytes", req_bytes},
521 {"allocation_bytes", alloc_bytes},
522 {"addr", reinterpret_cast<uint64>(chunk_ptr)},
523 {"tf_op", annotation.pending_op_name},
524 {"id", annotation.pending_step_id},
525 {"region_type", annotation.pending_region_type},
526 {"data_type", annotation.pending_data_type},
527 {"shape", tensor_shape}});
528 },
529 /*level=*/profiler::TraceMeLevel::kInfo);
530 }
531
532 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
533 size_t num_bytes, uint64 freed_before) {
534 // First identify the first bin that could satisfy rounded_bytes.
535 for (; bin_num < kNumBins; bin_num++) {
536 // Start searching from the first bin for the smallest chunk that fits
537 // rounded_bytes.
538 Bin* b = BinFromIndex(bin_num);
539 for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
540 ++citer) {
541 const BFCAllocator::ChunkHandle h = (*citer);
542 BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
543 DCHECK(!chunk->in_use());
544 if (freed_before > 0 && freed_before < chunk->freed_at_count) {
545 continue;
546 }
547 if (chunk->size >= rounded_bytes) {
548 // We found an existing chunk that fits us that wasn't in use, so remove
549 // it from the free bin structure prior to using.
550 RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
551
552 // If we can break the size of the chunk into two reasonably large
553 // pieces, do so. In any case don't waste more than a threshold of
554 // kMaxInternalFragmentation bytes on padding this alloc. If this
555 // threshold is not set by the user, then use 128MB as the default
556 // threshold.
557 const int64_t kMaxInternalFragmentation =
558 (internal_fragmentation_fraction_ > 0.0)
559 ? internal_fragmentation_fraction_ * memory_limit_
560 : 128 << 20;
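// For example, a 3KiB free chunk serving a 1KiB request is split
// (3KiB >= 2 * 1KiB), and a 130MiB chunk serving a 1MiB request is also split
// because the ~129MiB of padding would exceed the default 128MiB cap.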
561 if (chunk->size >= rounded_bytes * 2 ||
562 static_cast<int64>(chunk->size) - rounded_bytes >=
563 kMaxInternalFragmentation) {
564 SplitChunk(h, rounded_bytes);
565 chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
566 }
567
568 // The requested size of the returned chunk is what the user
569 // has allocated.
570 chunk->requested_size = num_bytes;
571 // Assign a unique id and increment the id counter, marking the
572 // chunk as being in use.
573 chunk->allocation_id = next_allocation_id_++;
574
575 // Update stats.
576 ++stats_.num_allocs;
577 stats_.bytes_in_use += chunk->size;
578 if (stats_.bytes_in_use > stats_.peak_bytes_in_use) {
579 VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use
580 << " bytes for " << Name();
581 }
582 stats_.peak_bytes_in_use =
583 std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
584 stats_.largest_alloc_size =
585 std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
586
587 #ifdef TENSORFLOW_MEM_DEBUG
588 if (ShouldRecordOpName()) {
589 const auto& annotation =
590 ScopedMemoryDebugAnnotation::CurrentAnnotation();
591 if (annotation.pending_op_name != nullptr) {
592 chunk->op_name = annotation.pending_op_name;
593 } else {
594 LOG(INFO) << "missing pending_op_name for " << Name()
595 << " reading addr "
596 << static_cast<const void*>(&annotation.pending_op_name)
597 << "\n"
598 << CurrentStackTrace();
599 chunk->op_name = nullptr;
600 }
601 chunk->action_count = ++action_counter_;
602 chunk->step_id = annotation.pending_step_id;
603 int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
604 size_history_[slot] = stats_.bytes_in_use;
605 }
606 #endif
607
608 VLOG(4) << "Returning: " << chunk->ptr;
609 if (VLOG_IS_ON(4)) {
610 LOG(INFO) << "A: " << RenderOccupancy();
611 }
612 return chunk->ptr;
613 }
614 }
615 }
616
617 return nullptr;
618 }
619
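// Splits the free chunk at handle h in place: the first num_bytes stay with h
// and the remainder becomes a new free chunk, linked after h in address order
// and inserted into the appropriate bin.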
620 void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
621 // Allocate the new chunk before doing any ChunkFromHandle lookups, since AllocateChunk may resize chunks_ and invalidate previously obtained Chunk pointers.
622 ChunkHandle h_new_chunk = AllocateChunk();
623
624 Chunk* c = ChunkFromHandle(h);
625 CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
626
627 // Create a new chunk starting num_bytes after c
628 BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
629 new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
630 region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
631
632 // Set the new sizes of the chunks.
633 new_chunk->size = c->size - num_bytes;
634 c->size = num_bytes;
635
636 // The new chunk is not in use.
637 new_chunk->allocation_id = -1;
638
639 // It inherits the freed time.
640 new_chunk->freed_at_count = c->freed_at_count;
641
642 // Maintain the pointers.
643 // c <-> c_neighbor becomes
644 // c <-> new_chunk <-> c_neighbor
645 BFCAllocator::ChunkHandle h_neighbor = c->next;
646 new_chunk->prev = h;
647 new_chunk->next = h_neighbor;
648 c->next = h_new_chunk;
649 if (h_neighbor != kInvalidChunkHandle) {
650 Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
651 c_neighbor->prev = h_new_chunk;
652 }
653
654 // Add the newly free chunk to the free bin.
655 InsertFreeChunkIntoBin(h_new_chunk);
656 }
657
658 void BFCAllocator::DeallocateRaw(void* ptr) {
659 VLOG(3) << "DeallocateRaw " << Name() << " "
660 << (ptr ? RequestedSize(ptr) : 0);
661 DeallocateRawInternal(ptr);
662 retry_helper_.NotifyDealloc();
663 }
664
665 void BFCAllocator::DeallocateRawInternal(void* ptr) {
666 if (ptr == nullptr) {
667 VLOG(2) << "tried to deallocate nullptr";
668 return;
669 }
670 mutex_lock l(lock_);
671
672 // Find the chunk from the ptr.
673 BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
674 CHECK(h != kInvalidChunkHandle);
675 // Record chunk information before it's freed.
676 Chunk* chunk = ChunkFromHandle(h);
677 void* chunk_ptr = chunk->ptr;
678 int64_t req_bytes = chunk->requested_size;
679 int64_t alloc_bytes = chunk->size;
680
681 MarkFree(h);
682
683 // Consider coalescing it.
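// When a timing counter is present, coalescing is deferred: the chunk keeps
// its freed_at_count and waits in timestamped_chunks_ until the safe frontier
// passes it (see MergeTimestampedChunks). Otherwise merge with free neighbors
// immediately.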
684 if (timing_counter_) {
685 InsertFreeChunkIntoBin(h);
686 timestamped_chunks_.push_back(h);
687 } else {
688 InsertFreeChunkIntoBin(TryToCoalesce(h, false));
689 }
690
691 // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
692 // correct aggregation stats (bytes_in_use, fragmentation).
693 AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
694
695 if (VLOG_IS_ON(4)) {
696 LOG(INFO) << "F: " << RenderOccupancy();
697 }
698 }
699
700 // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
701 // We merge Chunk(h2) into Chunk(h1).
702 void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
703 BFCAllocator::ChunkHandle h2) {
704 Chunk* c1 = ChunkFromHandle(h1);
705 Chunk* c2 = ChunkFromHandle(h2);
706 // We can only merge chunks that are not in use.
707 CHECK(!c1->in_use() && !c2->in_use());
708
709 // c1's prev doesn't change, still points to the same ptr, and is
710 // still not in use.
711
712 // Fix up neighbor pointers
713 //
714 // c1 <-> c2 <-> c3 should become
715 // c1 <-> c3
716
717 BFCAllocator::ChunkHandle h3 = c2->next;
718 c1->next = h3;
719 CHECK(c2->prev == h1);
720 if (h3 != kInvalidChunkHandle) {
721 BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
722 c3->prev = h1;
723 }
724
725 // Set the new size
726 c1->size += c2->size;
727
728 // Pick latest free time.
729 c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
730
731 DeleteChunk(h2);
732 }
733
734 void BFCAllocator::DeleteChunk(ChunkHandle h) {
735 // Delete h and cleanup all state
736 Chunk* c = ChunkFromHandle(h);
737 // VLOG(4) << "Removing: " << c->ptr;
738 region_manager_.erase(c->ptr);
739 DeallocateChunk(h);
740 }
741
742 void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
743 Chunk* c = ChunkFromHandle(h);
744 CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
745 BinNum bin_num = BinNumForSize(c->size);
746 Bin* new_bin = BinFromIndex(bin_num);
747 c->bin_num = bin_num;
748 new_bin->free_chunks.insert(h);
749 }
750
751 void BFCAllocator::RemoveFreeChunkIterFromBin(
752 BFCAllocator::Bin::FreeChunkSet* free_chunks,
753 const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
754 ChunkHandle h = *citer;
755 Chunk* c = ChunkFromHandle(h);
756 CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
757 free_chunks->erase(citer);
758 c->bin_num = kInvalidBinNum;
759 }
760
761 void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
762 Chunk* c = ChunkFromHandle(h);
763 CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
764 CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
765 << "Could not find chunk in bin";
766 c->bin_num = kInvalidBinNum;
767 }
768
769 void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
770 Chunk* c = ChunkFromHandle(h);
771 CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
772
773 // Mark the chunk as no longer in use.
774 c->allocation_id = -1;
775
776 // Optionally record the free time.
777 if (timing_counter_) {
778 c->freed_at_count = timing_counter_->next();
779 }
780
781 // Updates the stats.
782 stats_.bytes_in_use -= c->size;
783
784 #ifdef TENSORFLOW_MEM_DEBUG
785 if (ShouldRecordOpName()) {
786 c->action_count = ++action_counter_;
787 int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
788 size_history_[slot] = stats_.bytes_in_use;
789 }
790 #endif
791 }
792
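// Tries to merge the free chunk at h with its immediate neighbors in address
// order. Chunks holding a nonzero freed_at_count are skipped unless
// ignore_freed_at is set, so timestamped chunks are not merged prematurely.
// Returns the handle of the resulting (possibly larger) free chunk.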
793 BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
794 bool ignore_freed_at) {
795 Chunk* c = ChunkFromHandle(h);
796 if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
797 ChunkHandle coalesced_chunk = h;
798
799 // If the next chunk is free, merge it into c and delete it.
800 if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
801 Chunk* n = ChunkFromHandle(c->next);
802 if ((n->freed_at_count == 0) || ignore_freed_at) {
803 VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
804 RemoveFreeChunkFromBin(c->next);
805 Merge(h, c->next);
806 }
807 }
808
809 // If the previous chunk is free, merge c into it and delete c.
810 if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
811 Chunk* n = ChunkFromHandle(c->prev);
812 if ((n->freed_at_count == 0) || ignore_freed_at) {
813 VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
814 coalesced_chunk = c->prev;
815 RemoveFreeChunkFromBin(c->prev);
816 Merge(c->prev, h);
817 }
818 }
819
820 return coalesced_chunk;
821 }
822
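// Advances safe_frontier_ monotonically: the compare-exchange loop only
// publishes `count` while it is still greater than the stored value, and a
// successful advance notifies the retry helper so stalled allocations can
// retry.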
823 void BFCAllocator::SetSafeFrontier(uint64 count) {
824 uint64 current = safe_frontier_.load(std::memory_order_relaxed);
825 while (count > current) {
826 if (safe_frontier_.compare_exchange_strong(current, count)) {
827 retry_helper_.NotifyDealloc();
828 return;
829 } else {
830 current = safe_frontier_.load(std::memory_order_relaxed);
831 }
832 }
833 }
834
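// Walks timestamped_chunks_, merging chunks whose freed_at_count has passed
// the safe frontier; when required_bytes > 0 it also force-merges not-yet-safe
// chunks until a free chunk of at least required_bytes exists. Returns true
// when required_bytes is zero or a sufficiently large chunk was produced.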
835 bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
836 VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
837 << " required_bytes=" << required_bytes;
838 bool satisfied = (required_bytes == 0);
839 std::vector<void*> to_merge;
840 std::deque<ChunkHandle> new_ts_queue;
841 while (!timestamped_chunks_.empty()) {
842 ChunkHandle h = timestamped_chunks_.front();
843 timestamped_chunks_.pop_front();
844 DCHECK_NE(h, kInvalidChunkHandle);
845 Chunk* c = ChunkFromHandle(h);
846 // It's possible this chunk has already been merged so refetch and retest
847 // the handle.
848 h = region_manager_.get_handle(c->ptr);
849 if (h == kInvalidChunkHandle) {
850 continue;
851 }
852 if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
853 // This chunk has already been reallocated.
854 continue;
855 }
856 if (c->freed_at_count == 0) {
857 to_merge.push_back(c->ptr);
858 continue;
859 }
860 // Chunk should be free and assigned to a bin.
861 DCHECK_NE(c->bin_num, kInvalidBinNum);
862 if (c->freed_at_count < safe_frontier_) {
863 c->freed_at_count = 0;
864 to_merge.push_back(c->ptr);
865 } else if (required_bytes > 0) {
866 to_merge.push_back(c->ptr);
867 } else {
868 new_ts_queue.push_back(h);
869 }
870 }
871 DCHECK(timestamped_chunks_.empty());
872 std::swap(timestamped_chunks_, new_ts_queue);
873
874 // At this point all candidate chunks have been moved from timestamped_chunks_
875 // to to_merge. If this is a standard merge (required_bytes == 0) then
876 // merge them all, otherwise merge just until a Chunk of the required size
877 // is produced.
878 for (int ci = 0, end = to_merge.size(); ci < end; ++ci) {
879 void* ptr = to_merge[ci];
880 // It's possible that the Chunk associated with this memory location got
881 // merged and deallocated in a prior iteration so refetch the handle and
882 // retest.
883 ChunkHandle h = region_manager_.get_handle(ptr);
884 if (h == kInvalidChunkHandle) continue;
885 if (required_bytes == 0 || !satisfied) {
886 Chunk* c = ChunkFromHandle(h);
887 DCHECK_NE(c->bin_num, kInvalidBinNum);
888 DCHECK(!c->in_use());
889 RemoveFreeChunkFromBin(h);
890 ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
891 InsertFreeChunkIntoBin(new_h);
892 if (required_bytes > 0) {
893 c = ChunkFromHandle(new_h);
894 if (new_h != h && c->freed_at_count > 0) {
895 timestamped_chunks_.push_back(new_h);
896 }
897 if (c->size >= required_bytes) {
898 satisfied = true;
899 }
900 }
901 } else {
902 // We were force merging Chunks with unsafe timestamps, but managed
903 // to create a satisfying Chunk so just requeue the rest.
904 timestamped_chunks_.push_back(h);
905 }
906 }
907 return satisfied;
908 }
909
910 bool BFCAllocator::TracksAllocationSizes() const { return true; }
911
912 size_t BFCAllocator::RequestedSize(const void* ptr) const {
913 CHECK(ptr);
914 mutex_lock l(lock_);
915 BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
916 CHECK(h != kInvalidChunkHandle)
917 << "Asked for requested size of pointer we never allocated: " << ptr;
918 const BFCAllocator::Chunk* c = ChunkFromHandle(h);
919 return c->requested_size;
920 }
921
922 size_t BFCAllocator::AllocatedSize(const void* ptr) const {
923 mutex_lock l(lock_);
924 BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
925 CHECK(h != kInvalidChunkHandle)
926 << "Asked for allocated size of pointer we never allocated: " << ptr;
927 const BFCAllocator::Chunk* c = ChunkFromHandle(h);
928 return c->size;
929 }
930
931 int64 BFCAllocator::AllocationId(const void* ptr) const {
932 mutex_lock l(lock_);
933 BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
934 CHECK(h != kInvalidChunkHandle)
935 << "Asked for allocation id of pointer we never allocated: " << ptr;
936 const BFCAllocator::Chunk* c = ChunkFromHandle(h);
937 return c->allocation_id;
938 }
939
940 namespace {
941
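// Paints the render buckets covered by [ptr, ptr + size) with character c.
// `offset` positions this region within the concatenated region space of
// total_render_size bytes, which is mapped onto `resolution` buckets.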
942 void RenderRegion(char* rendered, const size_t resolution,
943 const size_t total_render_size, const size_t offset,
944 const void* base_ptr, const void* ptr, const size_t size,
945 const char c) {
946 const char* base_ptr_c = static_cast<const char*>(base_ptr);
947 const char* ptr_c = static_cast<const char*>(ptr);
948
949 size_t start_location =
950 ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
951 CHECK_GE(start_location, 0);
952 CHECK_LT(start_location, resolution);
953 size_t end_location =
954 ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
955 total_render_size;
956 CHECK_GE(end_location, 0);
957 CHECK_LT(end_location, resolution);
958
959 for (size_t i = start_location; i <= end_location; ++i) {
960 rendered[i] = c;
961 }
962 }
963
964 } // namespace
965
966 string BFCAllocator::RenderOccupancy() {
967 // Make a buffer for the ASCII-art representation.
968 const size_t resolution = 100;
969 char rendered[resolution];
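// Legend: '_' marks unallocated space, '*' marks bytes requested by clients,
// and 'x' marks padding inside in-use chunks (allocated but not requested).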
970
971 // Compute the total region size to render over
972 size_t total_region_size = 0;
973 for (const auto& region : region_manager_.regions()) {
974 total_region_size += region.memory_size();
975 }
976
977 if (total_region_size == 0) {
978 return "<allocator contains no memory>";
979 }
980
981 // Start out with everything empty
982 RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
983 total_region_size, '_');
984
985 size_t region_offset = 0;
986 for (const auto& region : region_manager_.regions()) {
987 ChunkHandle h = region_manager_.get_handle(region.ptr());
988 // Then render each chunk left to right.
989 while (h != kInvalidChunkHandle) {
990 Chunk* c = ChunkFromHandle(h);
991 if (c->in_use()) {
992 // Render the wasted space
993 size_t wasted = c->size - c->requested_size;
994 if (wasted > 0) {
995 RenderRegion(rendered, resolution, total_region_size,
996 region_offset + c->requested_size, region.ptr(), c->ptr,
997 wasted, 'x');
998 }
999 // Then the occupied space
1000 RenderRegion(rendered, resolution, total_region_size, region_offset,
1001 region.ptr(), c->ptr, c->requested_size, '*');
1002 }
1003 h = c->next;
1004 }
1005 region_offset += region.memory_size();
1006 }
1007
1008 return string(rendered, resolution);
1009 }
1010
1011 void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
1012 const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1013 LOG(INFO) << "BFCAllocator dump for " << Name();
1014 for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1015 Bin* b = BinFromIndex(bin_num);
1016 const BinDebugInfo& bin_info = bin_infos[bin_num];
1017 CHECK_EQ(b->free_chunks.size(),
1018 bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1019
1020 LOG(INFO) << "Bin (" << b->bin_size
1021 << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin
1022 << ", Chunks in use: " << bin_info.total_chunks_in_use << ". "
1023 << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin)
1024 << " allocated for chunks. "
1025 << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use)
1026 << " in use in bin. "
1027 << strings::HumanReadableNumBytes(
1028 bin_info.total_requested_bytes_in_use)
1029 << " client-requested in use in bin.";
1030 }
1031
1032 // Find the bin that we would have liked to allocate in, so we
1033 // can get some further analysis about fragmentation.
1034 Bin* b = BinForSize(num_bytes);
1035
1036 LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
1037 << " was " << strings::HumanReadableNumBytes(b->bin_size)
1038 << ", Chunk State: ";
1039
1040 for (ChunkHandle h : b->free_chunks) {
1041 Chunk* c = ChunkFromHandle(h);
1042 LOG(INFO) << c->DebugString(this, true);
1043 }
1044
1045 // Next show the chunks that are in use, and also summarize their
1046 // number by size.
1047 std::map<size_t, int> in_use_by_size;
1048 for (const auto& region : region_manager_.regions()) {
1049 LOG(INFO) << "Next region of size " << region.memory_size();
1050 ChunkHandle h = region_manager_.get_handle(region.ptr());
1051 while (h != kInvalidChunkHandle) {
1052 const Chunk* c = ChunkFromHandle(h);
1053 if (c->in_use()) {
1054 in_use_by_size[c->size]++;
1055 }
1056 string buf = strings::StrCat(
1057 (c->in_use() ? "InUse" : "Free "), " at ",
1058 strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
1059 #ifdef TENSORFLOW_MEM_DEBUG
1060 if (ShouldRecordOpName()) {
1061 strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
1062 c->action_count, " step ", c->step_id);
1063 }
1064 #endif
1065 strings::StrAppend(&buf, " next ", c->next);
1066 if (timing_counter_) {
1067 strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
1068 }
1069 LOG(INFO) << buf;
1070 h = c->next;
1071 }
1072 }
1073
1074 LOG(INFO) << " Summary of in-use Chunks by size: ";
1075 size_t total_bytes = 0;
1076 for (auto& it : in_use_by_size) {
1077 LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
1078 << strings::HumanReadableNumBytes(it.first * it.second);
1079 total_bytes += (it.first * it.second);
1080 }
1081 LOG(INFO) << "Sum Total of in-use chunks: "
1082 << strings::HumanReadableNumBytes(total_bytes);
1083 LOG(INFO) << "total_region_allocated_bytes_: "
1084 << total_region_allocated_bytes_
1085 << " memory_limit_: " << memory_limit_ << " available bytes: "
1086 << (memory_limit_ - total_region_allocated_bytes_)
1087 << " curr_region_allocation_bytes_: "
1088 << curr_region_allocation_bytes_;
1089 LOG(INFO) << "Stats: \n" << stats_.DebugString();
1090 }
1091
1092 void BFCAllocator::MaybeWriteMemoryMap() {
1093 const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
1094 if (gpu_memory_map_file != nullptr) {
1095 std::unique_ptr<WritableFile> dump_file;
1096 string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
1097 Env::Default()->NowMicros());
1098 Status status = Env::Default()->NewWritableFile(file_name, &dump_file);
1099 if (!status.ok()) {
1100 LOG(ERROR) << "Failed to open file " << file_name;
1101 return;
1102 }
1103 MemoryDump md = RecordMemoryMapInternal();
1104 status = dump_file->Append(md.SerializeAsString());
1105 if (!status.ok()) {
1106 LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": "
1107 << status;
1108 }
1109 }
1110 }
1111
1112 MemoryDump BFCAllocator::RecordMemoryMap() {
1113 mutex_lock l(lock_);
1114 return RecordMemoryMapInternal();
1115 }
1116
1117 MemoryDump BFCAllocator::RecordMemoryMapInternal() {
1118 MemoryDump md;
1119 md.set_allocator_name(Name());
1120
1121 // Record the general stats
1122 MemAllocatorStats* mas = md.mutable_stats();
1123 mas->set_num_allocs(stats_.num_allocs);
1124 mas->set_bytes_in_use(stats_.bytes_in_use);
1125 mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
1126 mas->set_largest_alloc_size(stats_.largest_alloc_size);
1127
1128 // Record summary data for every bin.
1129 const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
1130 for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
1131 Bin* b = BinFromIndex(bin_num);
1132 const BinDebugInfo& bin_info = bin_infos[bin_num];
1133 DCHECK_EQ(b->free_chunks.size(),
1134 bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
1135 BinSummary* bs = md.add_bin_summary();
1136 bs->set_bin(bin_num);
1137 bs->set_total_bytes_in_use(bin_info.total_bytes_in_use);
1138 bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin);
1139 bs->set_total_chunks_in_use(bin_info.total_chunks_in_use);
1140 bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin);
1141 }
1142
1143 // Record state of every defined Chunk.
1144 for (const auto& region : region_manager_.regions()) {
1145 ChunkHandle h = region_manager_.get_handle(region.ptr());
1146 while (h != kInvalidChunkHandle) {
1147 const Chunk* c = ChunkFromHandle(h);
1148 MemChunk* mc = md.add_chunk();
1149 mc->set_in_use(c->in_use());
1150 mc->set_address(reinterpret_cast<uint64>(c->ptr));
1151 mc->set_size(c->size);
1152 mc->set_requested_size(c->requested_size);
1153 mc->set_bin(c->bin_num);
1154 #ifdef TENSORFLOW_MEM_DEBUG
1155 mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
1156 mc->set_step_id(c->step_id);
1157 mc->set_action_count(c->action_count);
1158 #endif
1159 if (timing_counter_) {
1160 mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
1161 }
1162 h = c->next;
1163 }
1164 }
1165
1166 mas->set_fragmentation_metric(GetFragmentation());
1167
1168 #ifdef TENSORFLOW_MEM_DEBUG
1169 // Record the recent size history
1170 int history_len = static_cast<int>(std::min(
1171 action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
1172 for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
1173 SnapShot* ss = md.add_snap_shot();
1174 ss->set_action_count(i);
1175 int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
1176 ss->set_size(size_history_[slot]);
1177 }
1178 #endif
1179
1180 return md;
1181 }
1182
1183 absl::optional<AllocatorStats> BFCAllocator::GetStats() {
1184 mutex_lock l(lock_);
1185 return stats_;
1186 }
1187
1188 bool BFCAllocator::ClearStats() {
1189 mutex_lock l(lock_);
1190 stats_.num_allocs = 0;
1191 stats_.peak_bytes_in_use = stats_.bytes_in_use;
1192 stats_.largest_alloc_size = 0;
1193 return true;
1194 }
1195
1196 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
1197 BFCAllocator::get_bin_debug_info() {
1198 std::array<BinDebugInfo, kNumBins> bin_infos;
1199 for (const auto& region : region_manager_.regions()) {
1200 ChunkHandle h = region_manager_.get_handle(region.ptr());
1201 while (h != kInvalidChunkHandle) {
1202 const Chunk* c = ChunkFromHandle(h);
1203 BinNum bin_num = BinNumForSize(c->size);
1204 BinDebugInfo& bin_info = bin_infos[bin_num];
1205 bin_info.total_bytes_in_bin += c->size;
1206 bin_info.total_chunks_in_bin++;
1207 if (c->in_use()) {
1208 bin_info.total_bytes_in_use += c->size;
1209 bin_info.total_requested_bytes_in_use += c->requested_size;
1210 bin_info.total_chunks_in_use++;
1211 } else {
1212 Bin* bin = BinFromIndex(bin_num);
1213 CHECK_EQ(bin->free_chunks.count(h), 1);
1214 CHECK_EQ(c->bin_num, bin_num);
1215 }
1216 h = c->next;
1217 }
1218 }
1219 return bin_infos;
1220 }
1221
1222 } // namespace tensorflow
1223