/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_

#include <stdlib.h>

#include <functional>
#include <limits>

#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/numa.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

class TensorShape;

// Attributes for a single allocation call. Different calls to the same
// allocator could potentially have different allocation attributes.
struct AllocationAttributes {
  AllocationAttributes() = default;

  AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged,
                       std::function<uint64()>* freed_by_func)
      : retry_on_failure(retry_on_failure),
        allocation_will_be_logged(allocation_will_be_logged),
        freed_by_func(freed_by_func) {}

  // If the first attempt to allocate the memory fails, the allocation should
  // wait and retry (with a timeout).
  //
  // This is usually set to true, but we may set it to false in cases where a
  // failure has only performance impact (e.g. optional scratch space
  // allocation).
  bool retry_on_failure = true;
  // If a Tensor is allocated without the following set to true, then
  // it is logged as an unknown allocation. During execution Tensors
  // should be allocated through the OpKernelContext which records
  // which Op is performing the allocation, and sets this flag to
  // true.
  bool allocation_will_be_logged = false;
  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
  // a memory chunk whose freed_at_count is at this value or earlier may be
  // returned.
  std::function<uint64()>* freed_by_func = nullptr;  // Not owned.

  TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
};

// Annotations for memory profiling and debugging purposes. The runtime will
// cache the annotations in thread-local memory, and some allocators will try
// to tag allocations with the annotations.
struct MemoryDebugAnnotation {
  const char* pending_op_name = nullptr;
  int64 pending_step_id = 0;
  const char* pending_region_type = nullptr;
  int32 pending_data_type = 0;
  const TensorShape* pending_shape = nullptr;
};
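
// Example use of AllocationAttributes (a minimal sketch; `allocator`,
// `free_counter`, and `scratch_bytes` are illustrative names, not part of
// this interface):
//
//   std::function<uint64()> freed_by = [&free_counter]() {
//     return free_counter.load();  // e.g. a std::atomic<uint64> timing count.
//   };
//   AllocationAttributes attrs(/*retry_on_failure=*/false,
//                              /*allocation_will_be_logged=*/true, &freed_by);
//   void* scratch = allocator->AllocateRaw(Allocator::kAllocatorAlignment,
//                                          scratch_bytes, attrs);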

// Wrapper class of MemoryDebugAnnotation for RAII.
class ScopedMemoryDebugAnnotation {
 public:
  static const MemoryDebugAnnotation& CurrentAnnotation() {
    return annotation_;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name) {
    last_annotation_ = annotation_;
    CleanupAnnotation();
    annotation_.pending_op_name = op_name;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64_t step_id) {
    last_annotation_ = annotation_;
    CleanupAnnotation();
    annotation_.pending_op_name = op_name;
    annotation_.pending_step_id = step_id;
  }

  // This constructor keeps the pending_op_name and pending_step_id from parent
  // (if any). Otherwise it overwrites with op_name.
  explicit ScopedMemoryDebugAnnotation(const char* op_name,
                                       const char* region_type,
                                       int32_t data_type,
                                       const TensorShape* shape) {
    last_annotation_ = annotation_;
    if (!annotation_.pending_op_name) {
      annotation_.pending_op_name = op_name;
    }
    annotation_.pending_region_type = region_type;
    annotation_.pending_data_type = data_type;
    annotation_.pending_shape = shape;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64_t step_id,
                                       const char* region_type,
                                       int32_t data_type,
                                       const TensorShape* shape) {
    last_annotation_ = annotation_;
    annotation_.pending_op_name = op_name;
    annotation_.pending_step_id = step_id;
    annotation_.pending_region_type = region_type;
    annotation_.pending_data_type = data_type;
    annotation_.pending_shape = shape;
  }

  ~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }

 private:
  void CleanupAnnotation() {
    annotation_.pending_op_name = nullptr;
    annotation_.pending_step_id = 0;
    annotation_.pending_region_type = nullptr;
    annotation_.pending_data_type = 0;
    annotation_.pending_shape = nullptr;
  }

  // Stores the current annotations.
  static thread_local MemoryDebugAnnotation annotation_;

  // Stores the previous values in case the annotations are nested.
  MemoryDebugAnnotation last_annotation_;

  TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
};
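
// Example use of ScopedMemoryDebugAnnotation (a minimal sketch; the op name
// and step id are illustrative):
//
//   {
//     ScopedMemoryDebugAnnotation annotation("MyOp", /*step_id=*/42);
//     // Allocations made in this scope may be tagged with "MyOp" / 42 by
//     // allocators that consult CurrentAnnotation().
//   }
//   // On scope exit the previous (possibly nested) annotation is restored.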

// Runtime statistics collected by an allocator. Exactly the same as
// stream_executor::AllocatorStats, but independently defined to preserve the
// mutual independence of StreamExecutor and TensorFlow.
struct AllocatorStats {
  int64 num_allocs;          // Number of allocations.
  int64 bytes_in_use;        // Number of bytes in use.
  int64 peak_bytes_in_use;   // The peak bytes in use.
  int64 largest_alloc_size;  // The largest single allocation seen.

  // The upper limit of bytes of user allocatable device memory, if such a
  // limit is known.
  absl::optional<int64> bytes_limit;

  // Stats for reserved memory usage.
  int64 bytes_reserved;       // Number of bytes reserved.
  int64 peak_bytes_reserved;  // The peak number of bytes reserved.
  // The upper limit on the number of bytes of reservable memory,
  // if such a limit is known.
  absl::optional<int64> bytes_reservable_limit;

  int64 largest_free_block_bytes;  // Largest free block's size in heap.

  AllocatorStats()
      : num_allocs(0),
        bytes_in_use(0),
        peak_bytes_in_use(0),
        largest_alloc_size(0),
        bytes_reserved(0),
        peak_bytes_reserved(0),
        largest_free_block_bytes(0) {}

  std::string DebugString() const;
};
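
// Example use of AllocatorStats (a minimal sketch; `a` stands for any
// Allocator whose GetStats() is implemented):
//
//   if (absl::optional<AllocatorStats> stats = a->GetStats()) {
//     LOG(INFO) << "peak bytes in use: " << stats->peak_bytes_in_use;
//     LOG(INFO) << stats->DebugString();
//   }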

// Allocator is an abstract interface for allocating and deallocating
// device memory.
class Allocator {
 public:
  // Align to 64 byte boundary.
  static constexpr size_t kAllocatorAlignment = 64;

  virtual ~Allocator();

  // Returns a string identifying this allocator.
  virtual std::string Name() = 0;

  // Returns an uninitialized block of memory that is "num_bytes" bytes
  // in size. The returned pointer is guaranteed to be aligned to a
  // multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;

  // Returns an uninitialized block of memory that is "num_bytes" bytes
  // in size with specified allocation attributes. The returned pointer is
  // guaranteed to be aligned to a multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
                            const AllocationAttributes& allocation_attr) {
    // The default behavior is to use the implementation without any allocation
    // attributes.
    return AllocateRaw(alignment, num_bytes);
  }

  // Deallocates a block of memory pointed to by "ptr".
  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw.
  virtual void DeallocateRaw(void* ptr) = 0;

  // Returns true if this allocator tracks the sizes of allocations.
  // RequestedSize and AllocatedSize must be overridden if
  // TracksAllocationSizes is overridden to return true.
  virtual bool TracksAllocationSizes() const { return false; }

  // Returns true if this allocator allocates an opaque handle rather than the
  // requested number of bytes.
  //
  // This method returns false for most allocators, but may be used by
  // special-case allocators that track tensor usage. If this method returns
  // true, AllocateRaw() should be invoked for all values of `num_bytes`,
  // including 0.
  //
  // NOTE: It is the caller's responsibility to track whether an allocated
  // object is a buffer or an opaque handle. In particular, when this method
  // returns `true`, users of this allocator must not run any constructors or
  // destructors for complex objects, since there is no backing store for the
  // tensor in which to place their outputs.
  virtual bool AllocatesOpaqueHandle() const { return false; }

  // Returns the user-requested size of the data allocated at
  // 'ptr'. Note that the actual buffer allocated might be larger
  // than requested, but this function returns the size requested by
  // the user.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t RequestedSize(const void* ptr) const {
    CHECK(false) << "allocator doesn't track sizes";
    return size_t(0);
  }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is
  // guaranteed to be >= RequestedSize(ptr).
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSize(const void* ptr) const {
    return RequestedSize(ptr);
  }

  // Returns either 0 or an identifier assigned to the buffer at 'ptr'
  // when the buffer was returned by AllocateRaw. If non-zero, the
  // identifier differs from every other ID assigned by this
  // allocator.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual int64 AllocationId(const void* ptr) const { return 0; }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns 0. This method can be called when
  // TracksAllocationSizes() is false, but can be extremely slow.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSizeSlow(const void* ptr) const {
    if (TracksAllocationSizes()) {
      return AllocatedSize(ptr);
    }
    return 0;
  }

  // Fills in 'stats' with statistics collected by this allocator.
  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }

  // If implemented, clears the internal stats except for the `in_use` fields
  // and sets the `peak_bytes_in_use` to be equal to the `bytes_in_use`. Returns
  // true if implemented.
  //
  // REQUIRES: GetStats is overridden.
  virtual bool ClearStats() TF_MUST_USE_RESULT { return false; }

  virtual void SetSafeFrontier(uint64 count) {}

  // For allocators that are stream-aware, allows specifying the compute
  // stream this allocator is used for.
  virtual void SetStream(void* stream) {}
};
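
// Example use of the Allocator interface (a minimal sketch; `a` is any
// concrete Allocator and the sizes are illustrative):
//
//   void* p = a->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
//   if (a->TracksAllocationSizes()) {
//     size_t requested = a->RequestedSize(p);   // == 1024
//     size_t allocated = a->AllocatedSize(p);   // >= requested
//   }
//   a->DeallocateRaw(p);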

// An implementation of Allocator that delegates all calls to another
// Allocator.
//
// Useful to clients who want to override part of the functionality of another
// allocator.
class AllocatorWrapper : public Allocator {
 public:
  explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}

  ~AllocatorWrapper() override {}

  // Returns the wrapped allocator to which all calls are delegated.
  Allocator* wrapped() const { return wrapped_; }

  std::string Name() override { return wrapped_->Name(); }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return wrapped_->AllocateRaw(alignment, num_bytes);
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override {
    return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr);
  }

  void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }

  bool TracksAllocationSizes() const override {
    return wrapped_->TracksAllocationSizes();
  }

  bool AllocatesOpaqueHandle() const override {
    return wrapped_->AllocatesOpaqueHandle();
  }

  size_t RequestedSize(const void* ptr) const override {
    return wrapped_->RequestedSize(ptr);
  }

  size_t AllocatedSize(const void* ptr) const override {
    return wrapped_->AllocatedSize(ptr);
  }

  int64 AllocationId(const void* ptr) const override {
    return wrapped_->AllocationId(ptr);
  }

  size_t AllocatedSizeSlow(const void* ptr) const override {
    return wrapped_->AllocatedSizeSlow(ptr);
  }

 private:
  Allocator* const wrapped_;
};
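
// Example use of AllocatorWrapper (a minimal sketch of a hypothetical
// LoggingAllocator that overrides only one method and delegates the rest):
//
//   class LoggingAllocator : public AllocatorWrapper {
//    public:
//     using AllocatorWrapper::AllocatorWrapper;
//     void* AllocateRaw(size_t alignment, size_t num_bytes) override {
//       VLOG(1) << wrapped()->Name() << " allocating " << num_bytes;
//       return AllocatorWrapper::AllocateRaw(alignment, num_bytes);
//     }
//   };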

// A tensorflow Op may need access to different kinds of memory that
// are not simply a function of the device to which the Op has been
// assigned. For example, an Op executing on a GPU may still need
// to allocate CPU RAM for some purpose. Internal to the tensorflow
// runtime we may choose to allocate CPU RAM from special regions
// that have been prepared for higher performance in some use
// contexts, e.g. doing DMA with particular devices. For these
// reasons, the Device interface does not expose just one memory
// Allocator, but instead provides an accessor that takes a
// specification of the desired memory attributes in order to select
// an Allocator.
//
// Example use:
//   // Allocator for ordinary device memory:
//   Allocator* a = allocator(AllocatorAttributes());
//   ...
//   // Allocator for CPU RAM, regardless of where Op is executing:
//   AllocatorAttributes attr;
//   attr.set_on_host(true);
//   Allocator* a = allocator(attr);
struct AllocatorAttributes {
  void set_on_host(bool v) { value |= (static_cast<int>(v)); }
  bool on_host() const { return value & 0x1; }
  void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); }
  bool nic_compatible() const { return value & (0x1 << 1); }
  void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
  bool gpu_compatible() const { return value & (0x1 << 2); }
  void Merge(AllocatorAttributes other) {
    value |= other.value;
    if (scope_id != other.scope_id) {
      CHECK(scope_id == 0 || other.scope_id == 0)
          << "At least one scope_id should be zero to merge "
             "AllocatorAttributes but found this.scope_id="
          << scope_id << " and other.scope_id=" << other.scope_id;
      scope_id = scope_id == 0 ? other.scope_id : scope_id;
    }
  }
  // Returns true if the fields set in *this are a subset of or equal to
  // those set in other.
  bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
    return (value | other.value) == other.value;
  }

  // NOTE: The upper 8 bits of the value are reserved for
  // device-specific uses. Implementors of a device can interpret these
  // upper 8 bits in device-specific ways, and ops implemented for those
  // devices are responsible for setting those 8 bits appropriately.
  uint32 value = 0;
  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated
  // to a named special-purpose allocator on the same device.
  int32 scope_id = 0;

  // Returns a human readable representation of this.
  std::string DebugString() const;
};

// Returns a trivial implementation of Allocator, which is a process singleton.
// Access through this function is only intended for use by restricted parts
// of the infrastructure.
Allocator* cpu_allocator_base();

// If available, calls ProcessState::GetCPUAllocator(numa_node).
// If not, falls back to cpu_allocator_base().
// Intended for use in contexts where ProcessState is not visible at
// compile time. Where ProcessState is visible, it's preferable to
// call it directly.
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);

// Enables AllocatorStats in the default CPU allocator implementation. By
// default, it's disabled.
void EnableCPUAllocatorStats();
// Disables AllocatorStats in the default CPU allocator implementation. By
// default, it's disabled.
void DisableCPUAllocatorStats();
bool CPUAllocatorStatsEnabled();

// Enables full statistics collection in the default CPU allocator
// implementation. By default, it's disabled.
void EnableCPUAllocatorFullStats();
bool CPUAllocatorFullStatsEnabled();
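
// Example use of the CPU allocator accessors (a minimal sketch; the NUMA node
// and allocation size are illustrative):
//
//   EnableCPUAllocatorStats();
//   Allocator* a = cpu_allocator(/*numa_node=*/0);
//   void* p = a->AllocateRaw(Allocator::kAllocatorAlignment, 64);
//   ...
//   a->DeallocateRaw(p);
//   DisableCPUAllocatorStats();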

// An object that does the underlying suballoc/free of memory for a
// higher-level allocator.  The expectation is that the higher-level allocator
// is doing some kind of cache or pool management so that it will call
// SubAllocator::Alloc and Free relatively infrequently, compared to the number
// of times its own AllocateRaw and Free methods are called.
class SubAllocator {
 public:
  // Visitor gets called with a pointer to a memory area and its
  // size in bytes. The index value will be numa_node for a CPU
  // allocator and GPU id for a GPU allocator.
  typedef std::function<void(void*, int index, size_t)> Visitor;

  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors);

  virtual ~SubAllocator() {}
  // Allocates at least num_bytes. Returns the actual number of bytes allocated
  // in bytes_received. The caller can safely use the full bytes_received-sized
  // buffer following the returned pointer.
  virtual void* Alloc(size_t alignment, size_t num_bytes,
                      size_t* bytes_received) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

  // Returns true if the BFC allocator can safely coalesce adjacent regions
  // returned by this allocator.
  virtual bool SupportsCoalescing() const = 0;

 protected:
  // Implementation of Alloc() method must call this on newly allocated
  // value.
  void VisitAlloc(void* ptr, int index, size_t num_bytes);

  // Implementation of Free() method must call this on value to be
  // freed immediately before deallocation.
  void VisitFree(void* ptr, int index, size_t num_bytes);

  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_