/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_

#include <stdlib.h>

#include <functional>
#include <limits>
#include <vector>

#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/numa.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

class TensorShape;

// Attributes for a single allocation call. Different calls to the same
// allocator could potentially have different allocation attributes.
struct AllocationAttributes {
  AllocationAttributes() = default;

  AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged,
                       std::function<uint64()>* freed_by_func)
      : retry_on_failure(retry_on_failure),
        allocation_will_be_logged(allocation_will_be_logged),
        freed_by_func(freed_by_func) {}

  // If the first attempt to allocate the memory fails, the allocation should
  // wait and retry (with a timeout).
  //
  // This is usually set to true, but we may set it to false in cases where a
  // failure has only performance impact (e.g. optional scratch space
  // allocation).
  bool retry_on_failure = true;
  // If a Tensor is allocated without the following set to true, then
  // it is logged as an unknown allocation. During execution Tensors
  // should be allocated through the OpKernelContext, which records
  // which Op is performing the allocation and sets this flag to
  // true.
  bool allocation_will_be_logged = false;
  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
  // a memory chunk whose freed_at_count is at this value or earlier may be
  // returned.
  std::function<uint64()>* freed_by_func = nullptr;  // Not owned.

  TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
};

// Annotations for memory profiling and debugging purposes. The runtime will
// cache the annotations in thread-local memory, and some allocators will try
// to tag allocations with the annotations.
struct MemoryDebugAnnotation {
  const char* pending_op_name = nullptr;
  int64 pending_step_id = 0;
  const char* pending_region_type = nullptr;
  int32 pending_data_type = 0;
  const TensorShape* pending_shape = nullptr;
};

// RAII wrapper around MemoryDebugAnnotation: each constructor installs a new
// annotation for the current thread, and the destructor restores the previous
// one, so annotations nest naturally.
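//
// Example use (a minimal sketch; the op name and step id are illustrative):
//   {
//     ScopedMemoryDebugAnnotation scoped_annotation("MyOp", /*step_id=*/1);
//     // Allocations made on this thread inside this scope may be tagged
//     // with pending_op_name "MyOp" and pending_step_id 1.
//   }
//   // On scope exit, the previously installed annotation is restored.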
class ScopedMemoryDebugAnnotation {
 public:
  static const MemoryDebugAnnotation& CurrentAnnotation() {
    return annotation_;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name) {
    last_annotation_ = annotation_;
    CleanupAnnotation();
    annotation_.pending_op_name = op_name;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id) {
    last_annotation_ = annotation_;
    CleanupAnnotation();
    annotation_.pending_op_name = op_name;
    annotation_.pending_step_id = step_id;
  }

  // This constructor keeps the pending_op_name and pending_step_id from the
  // parent annotation (if any). Otherwise it overwrites pending_op_name with
  // op_name.
  explicit ScopedMemoryDebugAnnotation(const char* op_name,
                                       const char* region_type,
                                       int32 data_type,
                                       const TensorShape* shape) {
    last_annotation_ = annotation_;
    if (!annotation_.pending_op_name) {
      annotation_.pending_op_name = op_name;
    }
    annotation_.pending_region_type = region_type;
    annotation_.pending_data_type = data_type;
    annotation_.pending_shape = shape;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id,
                                       const char* region_type,
                                       int32 data_type,
                                       const TensorShape* shape) {
    last_annotation_ = annotation_;
    annotation_.pending_op_name = op_name;
    annotation_.pending_step_id = step_id;
    annotation_.pending_region_type = region_type;
    annotation_.pending_data_type = data_type;
    annotation_.pending_shape = shape;
  }

  ~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }

 private:
  void CleanupAnnotation() {
    annotation_.pending_op_name = nullptr;
    annotation_.pending_step_id = 0;
    annotation_.pending_region_type = nullptr;
    annotation_.pending_data_type = 0;
    annotation_.pending_shape = nullptr;
  }

  // Stores the current annotations.
  static thread_local MemoryDebugAnnotation annotation_;

  // Stores the previous values in case the annotations are nested.
  MemoryDebugAnnotation last_annotation_;

  TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
};

// Runtime statistics collected by an allocator. Exactly the same as
// stream_executor::AllocatorStats, but independently defined to preserve the
// mutual independence of StreamExecutor and TensorFlow.
struct AllocatorStats {
  int64 num_allocs;          // Number of allocations.
  int64 bytes_in_use;        // Number of bytes in use.
  int64 peak_bytes_in_use;   // The peak bytes in use.
  int64 largest_alloc_size;  // The largest single allocation seen.

  // The upper limit of bytes of user-allocatable device memory, if such a
  // limit is known.
  absl::optional<int64> bytes_limit;

  // Stats for reserved memory usage.
  int64 bytes_reserved;       // Number of bytes reserved.
  int64 peak_bytes_reserved;  // The peak number of bytes reserved.
  // The upper limit on the number of bytes of reservable memory,
  // if such a limit is known.
  absl::optional<int64> bytes_reservable_limit;

  int64 largest_free_block_bytes;  // Largest free block's size in heap.

  AllocatorStats()
      : num_allocs(0),
        bytes_in_use(0),
        peak_bytes_in_use(0),
        largest_alloc_size(0),
        bytes_reserved(0),
        peak_bytes_reserved(0),
        largest_free_block_bytes(0) {}

  std::string DebugString() const;
};
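
// Example of reading AllocatorStats (a minimal sketch; "allocator" stands for
// any Allocator* whose implementation collects stats; GetStats() is declared
// on the Allocator interface below):
//   absl::optional<AllocatorStats> stats = allocator->GetStats();
//   if (stats) {
//     LOG(INFO) << "bytes in use: " << stats->bytes_in_use
//               << ", peak: " << stats->peak_bytes_in_use;
//     LOG(INFO) << stats->DebugString();
//   }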

// Allocator is an abstract interface for allocating and deallocating
// device memory.
class Allocator {
 public:
  // Align to 64 byte boundary.
  static constexpr size_t kAllocatorAlignment = 64;

  virtual ~Allocator();

  // Returns a string identifying this allocator.
  virtual std::string Name() = 0;

  // Returns an uninitialized block of memory that is "num_bytes" bytes
  // in size. The returned pointer is guaranteed to be aligned to a
  // multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;

  // Returns an uninitialized block of memory that is "num_bytes" bytes
  // in size with the specified allocation attributes. The returned pointer is
  // guaranteed to be aligned to a multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
                            const AllocationAttributes& allocation_attr) {
    // The default behavior is to use the implementation without any allocation
    // attributes.
    return AllocateRaw(alignment, num_bytes);
  }

  // Deallocates the block of memory pointed to by "ptr".
  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw.
  virtual void DeallocateRaw(void* ptr) = 0;

  // Returns true if this allocator tracks the sizes of allocations.
  // RequestedSize and AllocatedSize must be overridden if
  // TracksAllocationSizes is overridden to return true.
  virtual bool TracksAllocationSizes() const { return false; }

  // Returns true if this allocator allocates an opaque handle rather than the
  // requested number of bytes.
  //
  // This method returns false for most allocators, but may be used by
  // special-case allocators that track tensor usage. If this method returns
  // true, AllocateRaw() should be invoked for all values of `num_bytes`,
  // including 0.
  //
  // NOTE: It is the caller's responsibility to track whether an allocated
  // object is a buffer or an opaque handle. In particular, when this method
  // returns `true`, users of this allocator must not run any constructors or
  // destructors for complex objects, since there is no backing store for the
  // tensor in which to place their outputs.
  virtual bool AllocatesOpaqueHandle() const { return false; }

  // Returns the user-requested size of the data allocated at
  // 'ptr'. Note that the actual buffer allocated might be larger
  // than requested, but this function returns the size requested by
  // the user.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t RequestedSize(const void* ptr) const {
    CHECK(false) << "allocator doesn't track sizes";
    return size_t(0);
  }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is
  // guaranteed to be >= RequestedSize(ptr).
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSize(const void* ptr) const {
    return RequestedSize(ptr);
  }

  // Returns either 0 or an identifier assigned to the buffer at 'ptr'
  // when the buffer was returned by AllocateRaw. If non-zero, the
  // identifier differs from every other ID assigned by this
  // allocator.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual int64 AllocationId(const void* ptr) const { return 0; }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns 0. This method can be called when
  // TracksAllocationSizes() is false, but can be extremely slow.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSizeSlow(const void* ptr) const {
    if (TracksAllocationSizes()) {
      return AllocatedSize(ptr);
    }
    return 0;
  }

  // Returns statistics collected by this allocator, if available.
  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }

  // Clears the internal stats except for the `in_use` field.
  virtual void ClearStats() {}

  virtual void SetSafeFrontier(uint64 count) {}
};
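
// Example of allocating through the Allocator interface (a minimal sketch;
// "allocator" stands for any concrete Allocator* and the byte count is
// illustrative):
//   AllocationAttributes attr(/*retry_on_failure=*/false,
//                             /*allocation_will_be_logged=*/true,
//                             /*freed_by_func=*/nullptr);
//   void* scratch =
//       allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024, attr);
//   if (scratch != nullptr) {
//     // ... use the buffer ...
//     allocator->DeallocateRaw(scratch);
//   }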

// An implementation of Allocator that delegates all calls to another
// Allocator.
//
// Useful to clients who want to override part of the functionality of another
// allocator.
class AllocatorWrapper : public Allocator {
 public:
  explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}

  ~AllocatorWrapper() override {}

  // Returns the wrapped allocator to which all calls are delegated.
  Allocator* wrapped() const { return wrapped_; }

  std::string Name() override { return wrapped_->Name(); }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return wrapped_->AllocateRaw(alignment, num_bytes);
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override {
    return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr);
  }

  void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }

  bool TracksAllocationSizes() const override {
    return wrapped_->TracksAllocationSizes();
  }

  bool AllocatesOpaqueHandle() const override {
    return wrapped_->AllocatesOpaqueHandle();
  }

  size_t RequestedSize(const void* ptr) const override {
    return wrapped_->RequestedSize(ptr);
  }

  size_t AllocatedSize(const void* ptr) const override {
    return wrapped_->AllocatedSize(ptr);
  }

  int64 AllocationId(const void* ptr) const override {
    return wrapped_->AllocationId(ptr);
  }

  size_t AllocatedSizeSlow(const void* ptr) const override {
    return wrapped_->AllocatedSizeSlow(ptr);
  }

 private:
  Allocator* const wrapped_;
};
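
// Example of overriding part of an allocator via AllocatorWrapper (a minimal
// sketch; "LoggingAllocator" is a hypothetical subclass, not part of this
// header):
//   class LoggingAllocator : public AllocatorWrapper {
//    public:
//     using AllocatorWrapper::AllocatorWrapper;
//     void* AllocateRaw(size_t alignment, size_t num_bytes) override {
//       VLOG(1) << "allocating " << num_bytes << " bytes";
//       return wrapped()->AllocateRaw(alignment, num_bytes);
//     }
//   };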

// A TensorFlow Op may need access to different kinds of memory that
// are not simply a function of the device to which the Op has been
// assigned. For example, an Op executing on a GPU may still need
// to allocate CPU RAM for some purpose. Internal to the TensorFlow
// runtime we may choose to allocate CPU RAM from special regions
// that have been prepared for higher performance in some use
// contexts, e.g. doing DMA with particular devices. For these
// reasons, the Device interface does not expose just one memory
// Allocator, but instead provides an accessor that takes a
// specification of the desired memory attributes in order to select
// an Allocator.
//
// Example use:
//   // Allocator for ordinary device memory:
//   Allocator* a = allocator(AllocatorAttributes());
//   ...
//   // Allocator for CPU RAM, regardless of where Op is executing:
//   AllocatorAttributes attr;
//   attr.set_on_host(true);
//   Allocator* a = allocator(attr);
struct AllocatorAttributes {
  void set_on_host(bool v) { value |= (static_cast<int>(v)); }
  bool on_host() const { return value & 0x1; }
  void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); }
  bool nic_compatible() const { return value & (0x1 << 1); }
  void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
  bool gpu_compatible() const { return value & (0x1 << 2); }
  void Merge(AllocatorAttributes other) {
    value |= other.value;
    if (scope_id != other.scope_id) {
      CHECK(scope_id == 0 || other.scope_id == 0)
          << "At least one scope_id should be zero to merge "
             "AllocatorAttributes but found this.scope_id="
          << scope_id << " and other.scope_id=" << other.scope_id;
      scope_id = scope_id == 0 ? other.scope_id : scope_id;
    }
  }
  // Returns true if the fields set in *this are a subset of, or equal to,
  // those set in other.
  bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
    return (value | other.value) == other.value;
  }

  // NOTE: The upper 8 bits of the value are reserved for
  // device-specific uses. Implementors of a device can interpret these
  // upper 8 bits in device-specific ways, and ops implemented for those
  // devices are responsible for setting those 8 bits appropriately.
  uint32 value = 0;
  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated
  // to a named special-purpose allocator on the same device.
  int32 scope_id = 0;

  // Returns a human-readable representation of this.
  std::string DebugString() const;
};

// Returns a trivial implementation of Allocator, which is a process singleton.
// Access through this function is only intended for use by restricted parts
// of the infrastructure.
Allocator* cpu_allocator_base();

// If available, calls ProcessState::GetCPUAllocator(numa_node).
// If not, falls back to cpu_allocator_base().
// Intended for use in contexts where ProcessState is not visible at
// compile time. Where ProcessState is visible, it's preferable to
// call it directly.
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);

// Enables AllocatorStats in the default CPU allocator implementation. By
// default, stats collection is disabled.
void EnableCPUAllocatorStats();
// Disables AllocatorStats in the default CPU allocator implementation.
void DisableCPUAllocatorStats();
bool CPUAllocatorStatsEnabled();

// Enables full statistics collection in the default CPU allocator
// implementation. By default, it's disabled.
void EnableCPUAllocatorFullStats();
bool CPUAllocatorFullStatsEnabled();
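
// Example of obtaining and using the default CPU allocator (a minimal sketch;
// the 4096-byte size is illustrative):
//   EnableCPUAllocatorStats();  // Optional: collect AllocatorStats.
//   Allocator* a = cpu_allocator();
//   void* p = a->AllocateRaw(Allocator::kAllocatorAlignment, 4096);
//   // ... use the buffer ...
//   a->DeallocateRaw(p);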

// An object that does the underlying suballoc/free of memory for a
// higher-level allocator. The expectation is that the higher-level allocator
// is doing some kind of cache or pool management so that it will call
// SubAllocator::Alloc and Free relatively infrequently, compared to the number
// of times its own AllocateRaw and Free methods are called.
class SubAllocator {
 public:
  // Visitor gets called with a pointer to a memory area and its
  // size in bytes. The index value will be numa_node for a CPU
  // allocator and GPU id for a GPU allocator.
  typedef std::function<void(void*, int index, size_t)> Visitor;

  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors);

  virtual ~SubAllocator() {}

  // Allocates at least num_bytes. Returns the actual number of bytes
  // allocated in bytes_received. The caller can safely use the full
  // bytes_received-sized buffer following the returned pointer.
  virtual void* Alloc(size_t alignment, size_t num_bytes,
                      size_t* bytes_received) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

  // Returns true if the BFC allocator can safely coalesce adjacent regions
  // returned by this allocator.
  virtual bool SupportsCoalescing() const = 0;

 protected:
  // Implementation of Alloc() method must call this on newly allocated
  // value.
  void VisitAlloc(void* ptr, int index, size_t num_bytes);

  // Implementation of Free() method must call this on value to be
  // freed immediately before deallocation.
  void VisitFree(void* ptr, int index, size_t num_bytes);

  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_