/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_

#include <stdlib.h>

#include <functional>
#include <limits>
#include <vector>

#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/numa.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

// Attributes for a single allocation call. Different calls to the same
// allocator could potentially have different allocation attributes.
struct AllocationAttributes {
  AllocationAttributes() = default;

  AllocationAttributes(bool no_retry_on_failure,
                       bool allocation_will_be_logged,
                       std::function<uint64()>* freed_by_func)
      : no_retry_on_failure(no_retry_on_failure),
        allocation_will_be_logged(allocation_will_be_logged),
        freed_by_func(freed_by_func) {}

  // If the first attempt to allocate the memory fails, the allocation should
  // return immediately without retrying. An example use case is optional
  // scratch space, where a failure has only a performance impact.
  bool no_retry_on_failure = false;
  // If a Tensor is allocated without the following set to true, then
  // it is logged as an unknown allocation. During execution Tensors
  // should be allocated through the OpKernelContext, which records
  // which Op is performing the allocation, and sets this flag to
  // true.
  bool allocation_will_be_logged = false;
  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
  // a memory chunk whose freed_at_count is at this value or earlier may be
  // returned.
  std::function<uint64()>* freed_by_func = nullptr;  // Not owned.

  TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
};
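
// Example use (an illustrative sketch; `allocator` is assumed to be a valid
// Allocator*, an interface declared later in this file): request optional
// scratch space that fails fast instead of retrying, since running without
// it costs only performance.
//
//   AllocationAttributes attr(/*no_retry_on_failure=*/true,
//                             /*allocation_will_be_logged=*/false,
//                             /*freed_by_func=*/nullptr);
//   void* scratch =
//       allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024, attr);
//   if (scratch == nullptr) {
//     // Fall back to a code path that needs no scratch space.
//   }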

// If defined, the runtime will cache Op names in thread-local memory
// and some allocators will try to tag allocations with the requesting Op.
#ifdef TENSORFLOW_MEM_DEBUG
extern thread_local const char* pending_op_name;
extern thread_local uint64 pending_step_id;
#define MEMDEBUG_CACHE_OP(N) \
  do {                       \
    pending_op_name = (N);   \
  } while (0)
#define MEMDEBUG_CACHE_STEPID(N) \
  do {                           \
    pending_step_id = (N);       \
  } while (0)
#define MEMDEBUG_CACHE_VAL pending_op_name
#else
#define MEMDEBUG_CACHE_OP(N) \
  do {                       \
  } while (0)
#define MEMDEBUG_CACHE_STEPID(N) \
  do {                           \
  } while (0)
#define MEMDEBUG_CACHE_VAL nullptr
#endif

// Runtime statistics collected by an allocator. Exactly the same as
// stream_executor::AllocatorStats, but independently defined to preserve the
// mutual independence of StreamExecutor and TensorFlow.
struct AllocatorStats {
  int64 num_allocs;          // Number of allocations.
  int64 bytes_in_use;        // Number of bytes in use.
  int64 peak_bytes_in_use;   // The peak bytes in use.
  int64 largest_alloc_size;  // The largest single allocation seen.

  // The upper limit of bytes of user allocatable device memory, if such a
  // limit is known.
  absl::optional<int64> bytes_limit;

  // Stats for reserved memory usage.
  int64 bytes_reserved;       // Number of bytes reserved.
  int64 peak_bytes_reserved;  // The peak number of bytes reserved.
  // The upper limit on the number of bytes of reservable memory,
  // if such a limit is known.
  absl::optional<int64> bytes_reservable_limit;

  AllocatorStats()
      : num_allocs(0),
        bytes_in_use(0),
        peak_bytes_in_use(0),
        largest_alloc_size(0),
        bytes_reserved(0),
        peak_bytes_reserved(0) {}

  string DebugString() const;
};
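
// Example use (an illustrative sketch; `a` is assumed to be a valid
// Allocator* -- see the Allocator interface below -- whose implementation
// actually collects stats):
//
//   absl::optional<AllocatorStats> stats = a->GetStats();
//   if (stats) {
//     LOG(INFO) << "peak bytes in use: " << stats->peak_bytes_in_use;
//   }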

// Allocator is an abstract interface for allocating and deallocating
// device memory.
class Allocator {
 public:
  // Align to 64 byte boundary.
  static constexpr size_t kAllocatorAlignment = 64;

  virtual ~Allocator();

  // Return a string identifying this allocator.
  virtual string Name() = 0;

  // Return an uninitialized block of memory that is "num_bytes" bytes
  // in size. The returned pointer is guaranteed to be aligned to a
  // multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;

  // Return an uninitialized block of memory that is "num_bytes" bytes
  // in size with the specified allocation attributes. The returned pointer
  // is guaranteed to be aligned to a multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
                            const AllocationAttributes& allocation_attr) {
    // The default behavior is to use the implementation without any
    // allocation attributes.
    return AllocateRaw(alignment, num_bytes);
  }

  // Deallocate a block of memory pointed to by "ptr".
  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw.
  virtual void DeallocateRaw(void* ptr) = 0;

  // Returns true if this allocator tracks the sizes of allocations.
  // RequestedSize and AllocatedSize must be overridden if
  // TracksAllocationSizes is overridden to return true.
  virtual bool TracksAllocationSizes() const { return false; }

  // Returns true if this allocator allocates an opaque handle rather than the
  // requested number of bytes.
  //
  // This method returns false for most allocators, but may be used by
  // special-case allocators that track tensor usage. If this method returns
  // true, AllocateRaw() should be invoked for all values of `num_bytes`,
  // including 0.
  //
  // NOTE: It is the caller's responsibility to track whether an allocated
  // object is a buffer or an opaque handle. In particular, when this method
  // returns `true`, users of this allocator must not run any constructors or
  // destructors for complex objects, since there is no backing store for the
  // tensor in which to place their outputs.
  virtual bool AllocatesOpaqueHandle() const { return false; }

  // Returns the user-requested size of the data allocated at
  // 'ptr'. Note that the actual buffer allocated might be larger
  // than requested, but this function returns the size requested by
  // the user.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t RequestedSize(const void* ptr) const {
    CHECK(false) << "allocator doesn't track sizes";
    return size_t(0);
  }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is
  // guaranteed to be >= RequestedSize(ptr).
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSize(const void* ptr) const {
    return RequestedSize(ptr);
  }

  // Returns either 0 or an identifier assigned to the buffer at 'ptr'
  // when the buffer was returned by AllocateRaw. If non-zero, the
  // identifier differs from every other ID assigned by this
  // allocator.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual int64 AllocationId(const void* ptr) const { return 0; }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns 0. This method can be called when
  // TracksAllocationSizes() is false, but can be extremely slow.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSizeSlow(const void* ptr) const {
    if (TracksAllocationSizes()) {
      return AllocatedSize(ptr);
    }
    return 0;
  }

  // Returns statistics collected by this allocator, or nullopt if this
  // allocator doesn't collect any.
  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }

  // Clears the internal stats except for the `in_use` field.
  virtual void ClearStats() {}

  // EXPERIMENTAL: Notifies the allocator that the timing count has safely
  // advanced to `count`, so memory chunks with freed_at_count <= count may
  // be reused (see AllocationAttributes::freed_by_func).
  virtual void SetSafeFrontier(uint64 count) {}
};
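
// Example use of the raw allocation interface (an illustrative sketch; `a`
// is assumed to be a valid Allocator*, e.g. obtained from cpu_allocator(),
// declared later in this file):
//
//   void* buf = a->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
//   if (buf != nullptr) {
//     // ... use up to 1024 bytes of uninitialized memory ...
//     a->DeallocateRaw(buf);
//   }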

// An implementation of Allocator that delegates all calls to another
// Allocator.
//
// Useful to clients who want to override part of the functionality of another
// allocator.
class AllocatorWrapper : public Allocator {
 public:
  explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}

  ~AllocatorWrapper() override {}

  // Returns the wrapped allocator to which all calls are delegated.
  Allocator* wrapped() const { return wrapped_; }

  string Name() override { return wrapped_->Name(); }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return wrapped_->AllocateRaw(alignment, num_bytes);
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override {
    return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr);
  }

  void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }

  bool TracksAllocationSizes() const override {
    return wrapped_->TracksAllocationSizes();
  }

  bool AllocatesOpaqueHandle() const override {
    return wrapped_->AllocatesOpaqueHandle();
  }

  size_t RequestedSize(const void* ptr) const override {
    return wrapped_->RequestedSize(ptr);
  }

  size_t AllocatedSize(const void* ptr) const override {
    return wrapped_->AllocatedSize(ptr);
  }

  int64 AllocationId(const void* ptr) const override {
    return wrapped_->AllocationId(ptr);
  }

  size_t AllocatedSizeSlow(const void* ptr) const override {
    return wrapped_->AllocatedSizeSlow(ptr);
  }

 private:
  Allocator* const wrapped_;
};
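
// Example use (an illustrative sketch; `LoggingAllocator` is hypothetical,
// not part of TensorFlow): override just one method and let every other
// call delegate to the wrapped allocator.
//
//   class LoggingAllocator : public AllocatorWrapper {
//    public:
//     using AllocatorWrapper::AllocatorWrapper;
//     void* AllocateRaw(size_t alignment, size_t num_bytes) override {
//       VLOG(1) << Name() << " allocating " << num_bytes << " bytes";
//       return AllocatorWrapper::AllocateRaw(alignment, num_bytes);
//     }
//   };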

// A tensorflow Op may need access to different kinds of memory that
// are not simply a function of the device to which the Op has been
// assigned. For example, an Op executing on a GPU may still need
// to allocate CPU RAM for some purpose. Internal to the tensorflow
// runtime we may choose to allocate CPU RAM from special regions
// that have been prepared for higher performance in some use
// contexts, e.g. doing DMA with particular devices. For these
// reasons, the Device interface does not expose just one memory
// Allocator, but instead provides an accessor that takes a
// specification of the desired memory attributes in order to select
// an Allocator.
//
// Example use:
//   // Allocator for ordinary device memory:
//   Allocator* a = allocator(AllocatorAttributes());
//   ...
//   // Allocator for CPU RAM, regardless of where Op is executing:
//   AllocatorAttributes attr;
//   attr.set_on_host(true);
//   Allocator* a = allocator(attr);
struct AllocatorAttributes {
  void set_on_host(bool v) { value |= (static_cast<int>(v)); }
  bool on_host() const { return value & 0x1; }
  void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); }
  bool nic_compatible() const { return value & (0x1 << 1); }
  void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
  bool gpu_compatible() const { return value & (0x1 << 2); }
  void Merge(AllocatorAttributes other) {
    value |= other.value;
    if (scope_id != other.scope_id) {
      CHECK(scope_id == 0 || other.scope_id == 0)
          << "At least one scope_id should be zero to merge "
             "AllocatorAttributes but found this.scope_id="
          << scope_id << " and other.scope_id=" << other.scope_id;
      scope_id = scope_id == 0 ? other.scope_id : scope_id;
    }
  }
  // Returns true if the fields set in *this are a subset of, or equal to,
  // those set in other.
  bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
    return (value | other.value) == other.value;
  }

  // NOTE: The upper 8 bits of the value are reserved for
  // device-specific uses. Implementors of a device can interpret these
  // upper 8 bits in device-specific ways, and ops implemented for those
  // devices are responsible for setting those 8 bits appropriately.
  uint32 value = 0;
  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated
  // to a named special-purpose allocator on the same device.
  int32 scope_id = 0;

  // Returns a human readable representation of this.
  string DebugString() const;
};

// Returns a trivial implementation of Allocator, which is a process
// singleton. Access through this function is only intended for use by
// restricted parts of the infrastructure.
Allocator* cpu_allocator_base();

// If available, calls ProcessState::GetCPUAllocator(numa_node).
// If not, falls back to cpu_allocator_base().
// Intended for use in contexts where ProcessState is not visible at
// compile time. Where ProcessState is visible, it's preferable to
// call it directly.
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);

// If 'enable' is true, the default CPU allocator implementation will collect
// AllocatorStats. By default, it's disabled.
void EnableCPUAllocatorStats(bool enable);
bool CPUAllocatorStatsEnabled();

// If 'enable' is true, the default CPU allocator implementation will collect
// full statistics. By default, it's disabled.
void EnableCPUAllocatorFullStats(bool enable);
bool CPUAllocatorFullStatsEnabled();
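
// Example use (an illustrative sketch; stats collection is assumed to be
// turned on early, before the allocations of interest take place):
//
//   EnableCPUAllocatorStats(true);
//   Allocator* a = cpu_allocator();                  // any NUMA node
//   Allocator* a0 = cpu_allocator(/*numa_node=*/0);  // NUMA node 0, if known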

// An object that does the underlying suballoc/free of memory for a
// higher-level allocator. The expectation is that the higher-level allocator
// is doing some kind of cache or pool management so that it will call
// SubAllocator::Alloc and Free relatively infrequently, compared to the
// number of times its own AllocateRaw and Free methods are called.
class SubAllocator {
 public:
  // Visitor gets called with a pointer to a memory area and its
  // size in bytes. The index value will be numa_node for a CPU
  // allocator and GPU id for a GPU allocator.
  typedef std::function<void(void*, int index, size_t)> Visitor;

  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors);

  virtual ~SubAllocator() {}
  virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

 protected:
  // Implementation of Alloc() method must call this on newly allocated
  // value.
  void VisitAlloc(void* ptr, int index, size_t num_bytes);

  // Implementation of Free() method must call this on value to be
  // freed immediately before deallocation.
  void VisitFree(void* ptr, int index, size_t num_bytes);

  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_