/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_
#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_

#include <stdlib.h>

#include <functional>
#include <limits>

#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/numa.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

class TensorShape;

// Attributes for a single allocation call. Different calls to the same
// allocator could potentially have different allocation attributes.
struct AllocationAttributes {
  AllocationAttributes() = default;

  AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged,
                       std::function<uint64()>* freed_by_func)
      : retry_on_failure(retry_on_failure),
        allocation_will_be_logged(allocation_will_be_logged),
        freed_by_func(freed_by_func) {}

  // If the first attempt to allocate the memory fails, the allocation should
  // wait and retry (with a timeout).
  //
  // This is usually set to true, but we may set it to false in cases where a
  // failure has only a performance impact (e.g. optional scratch space
  // allocation).
  bool retry_on_failure = true;
  // If a Tensor is allocated without the following set to true, then
  // it is logged as an unknown allocation. During execution Tensors
  // should be allocated through the OpKernelContext, which records
  // which Op is performing the allocation and sets this flag to
  // true.
  bool allocation_will_be_logged = false;
  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
  // a memory chunk whose freed_at_count is at this value or earlier may be
  // returned.
  std::function<uint64()>* freed_by_func = nullptr;  // Not owned.

  TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
};
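
// Example use of AllocationAttributes (an illustrative sketch; the allocator
// pointer `a` and size `scratch_bytes` are hypothetical, not part of this
// header):
//   // Optional scratch space: a failed allocation only costs performance,
//   // so skip the retry-with-timeout behavior.
//   AllocationAttributes attrs(/*retry_on_failure=*/false,
//                              /*allocation_will_be_logged=*/false,
//                              /*freed_by_func=*/nullptr);
//   void* scratch =
//       a->AllocateRaw(Allocator::kAllocatorAlignment, scratch_bytes, attrs);
//   if (scratch == nullptr) {
//     // Fall back to a slower code path.
//   }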

// Annotations for memory profiling and debugging purposes. The runtime will
// cache the annotations in thread-local memory, and some allocators will try
// to tag allocations with the annotations.
struct MemoryDebugAnnotation {
  const char* pending_op_name = nullptr;
  int64 pending_step_id = 0;
  const char* pending_region_type = nullptr;
  int32 pending_data_type = 0;
  const TensorShape* pending_shape = nullptr;
};

// Wrapper class of MemoryDebugAnnotation for RAII.
class ScopedMemoryDebugAnnotation {
 public:
  static const MemoryDebugAnnotation& CurrentAnnotation() {
    return annotation_;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name) {
    last_annotation_ = annotation_;
    CleanupAnnotation();
    annotation_.pending_op_name = op_name;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64_t step_id) {
    last_annotation_ = annotation_;
    CleanupAnnotation();
    annotation_.pending_op_name = op_name;
    annotation_.pending_step_id = step_id;
  }

  // This constructor keeps the pending_op_name and pending_step_id from the
  // parent annotation (if any). Otherwise it overwrites pending_op_name with
  // op_name.
  explicit ScopedMemoryDebugAnnotation(const char* op_name,
                                       const char* region_type,
                                       int32_t data_type,
                                       const TensorShape* shape) {
    last_annotation_ = annotation_;
    if (!annotation_.pending_op_name) {
      annotation_.pending_op_name = op_name;
    }
    annotation_.pending_region_type = region_type;
    annotation_.pending_data_type = data_type;
    annotation_.pending_shape = shape;
  }

  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64_t step_id,
                                       const char* region_type,
                                       int32_t data_type,
                                       const TensorShape* shape) {
    last_annotation_ = annotation_;
    annotation_.pending_op_name = op_name;
    annotation_.pending_step_id = step_id;
    annotation_.pending_region_type = region_type;
    annotation_.pending_data_type = data_type;
    annotation_.pending_shape = shape;
  }

  ~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }

 private:
  void CleanupAnnotation() {
    annotation_.pending_op_name = nullptr;
    annotation_.pending_step_id = 0;
    annotation_.pending_region_type = nullptr;
    annotation_.pending_data_type = 0;
    annotation_.pending_shape = nullptr;
  }

  // Stores the current annotations.
  static thread_local MemoryDebugAnnotation annotation_;

  // Stores the previous values in case the annotations are nested.
  MemoryDebugAnnotation last_annotation_;

  TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
};
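
// Example use of ScopedMemoryDebugAnnotation (an illustrative sketch; the op
// name, step id, and allocator call are hypothetical):
//   {
//     ScopedMemoryDebugAnnotation scoped_annotation("MyOp", /*step_id=*/42);
//     // Allocations made on this thread while the scope is live may be tagged
//     // by annotation-aware allocators with the pending_op_name and
//     // pending_step_id recorded above.
//     void* buf = a->AllocateRaw(Allocator::kAllocatorAlignment, num_bytes);
//     // ... use buf ...
//   }  // The previous annotation (if any) is restored on scope exit.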

// Runtime statistics collected by an allocator. Exactly the same as
// stream_executor::AllocatorStats, but independently defined to preserve the
// mutual independence of StreamExecutor and TensorFlow.
struct AllocatorStats {
  int64 num_allocs;          // Number of allocations.
  int64 bytes_in_use;        // Number of bytes in use.
  int64 peak_bytes_in_use;   // The peak bytes in use.
  int64 largest_alloc_size;  // The largest single allocation seen.

  // The upper limit of bytes of user allocatable device memory, if such a
  // limit is known.
  absl::optional<int64> bytes_limit;

  // Stats for reserved memory usage.
  int64 bytes_reserved;       // Number of bytes reserved.
  int64 peak_bytes_reserved;  // The peak number of bytes reserved.
  // The upper limit on the number of bytes of reservable memory,
  // if such a limit is known.
  absl::optional<int64> bytes_reservable_limit;

  int64 largest_free_block_bytes;  // Largest free block's size in heap.

  AllocatorStats()
      : num_allocs(0),
        bytes_in_use(0),
        peak_bytes_in_use(0),
        largest_alloc_size(0),
        bytes_reserved(0),
        peak_bytes_reserved(0),
        largest_free_block_bytes(0) {}

  std::string DebugString() const;
};

// Allocator is an abstract interface for allocating and deallocating
// device memory.
class Allocator {
 public:
  // Align to 64 byte boundary.
  static constexpr size_t kAllocatorAlignment = 64;

  virtual ~Allocator();

  // Return a string identifying this allocator.
  virtual std::string Name() = 0;

  // Return an uninitialized block of memory that is "num_bytes" bytes
  // in size.  The returned pointer is guaranteed to be aligned to a
  // multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0;

  // Return an uninitialized block of memory that is "num_bytes" bytes
  // in size with the specified allocation attributes.  The returned pointer
  // is guaranteed to be aligned to a multiple of "alignment" bytes.
  // REQUIRES: "alignment" is a power of 2.
  virtual void* AllocateRaw(size_t alignment, size_t num_bytes,
                            const AllocationAttributes& allocation_attr) {
    // The default behavior is to use the implementation without any allocation
    // attributes.
    return AllocateRaw(alignment, num_bytes);
  }

  // Deallocate a block of memory pointed to by "ptr".
  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw.
  virtual void DeallocateRaw(void* ptr) = 0;

  // Returns true if this allocator tracks the sizes of allocations.
  // RequestedSize and AllocatedSize must be overridden if
  // TracksAllocationSizes is overridden to return true.
  virtual bool TracksAllocationSizes() const { return false; }

  // Returns true if this allocator allocates an opaque handle rather than the
  // requested number of bytes.
  //
  // This method returns false for most allocators, but may be used by
  // special-case allocators that track tensor usage. If this method returns
  // true, AllocateRaw() should be invoked for all values of `num_bytes`,
  // including 0.
  //
  // NOTE: It is the caller's responsibility to track whether an allocated
  // object is a buffer or an opaque handle. In particular, when this method
  // returns `true`, users of this allocator must not run any constructors or
  // destructors for complex objects, since there is no backing store for the
  // tensor in which to place their outputs.
  virtual bool AllocatesOpaqueHandle() const { return false; }

  // Returns the user-requested size of the data allocated at
  // 'ptr'.  Note that the actual buffer allocated might be larger
  // than requested, but this function returns the size requested by
  // the user.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t RequestedSize(const void* ptr) const {
    CHECK(false) << "allocator doesn't track sizes";
    return size_t(0);
  }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is
  // guaranteed to be >= RequestedSize(ptr).
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSize(const void* ptr) const {
    return RequestedSize(ptr);
  }

  // Returns either 0 or an identifier assigned to the buffer at 'ptr'
  // when the buffer was returned by AllocateRaw. If non-zero, the
  // identifier differs from every other ID assigned by this
  // allocator.
  //
  // REQUIRES: TracksAllocationSizes() is true.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual int64 AllocationId(const void* ptr) const { return 0; }

  // Returns the allocated size of the buffer at 'ptr' if known,
  // otherwise returns 0. This method can be called when
  // TracksAllocationSizes() is false, but can be extremely slow.
  //
  // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
  // allocated by this allocator.
  virtual size_t AllocatedSizeSlow(const void* ptr) const {
    if (TracksAllocationSizes()) {
      return AllocatedSize(ptr);
    }
    return 0;
  }

  // Returns statistics collected by this allocator, or absl::nullopt if this
  // allocator doesn't collect statistics.
  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }

  // If implemented, clears the internal stats except for the `in_use` fields
  // and sets the `peak_bytes_in_use` to be equal to the `bytes_in_use`.
  // Returns true if implemented.
  //
  // REQUIRES: GetStats is overridden.
  virtual bool ClearStats() TF_MUST_USE_RESULT { return false; }

  virtual void SetSafeFrontier(uint64 count) {}

  // For allocators that are stream aware, allows specifying the compute
  // stream this allocator is used for.
  virtual void SetStream(void* stream) {}
};
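
// Example use of the Allocator interface (an illustrative sketch; the element
// count `n` and the choice of cpu_allocator() are assumptions):
//   Allocator* a = cpu_allocator();
//   void* p = a->AllocateRaw(Allocator::kAllocatorAlignment, n * sizeof(float));
//   if (p != nullptr) {
//     // ... fill the buffer ...
//     a->DeallocateRaw(p);
//   }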

// An implementation of Allocator that delegates all calls to another
// Allocator.
//
// Useful to clients who want to override part of the functionality of another
// allocator.
class AllocatorWrapper : public Allocator {
 public:
  explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {}

  ~AllocatorWrapper() override {}

  // Returns the wrapped allocator to which all calls are delegated.
  Allocator* wrapped() const { return wrapped_; }

  std::string Name() override { return wrapped_->Name(); }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return wrapped_->AllocateRaw(alignment, num_bytes);
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override {
    return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr);
  }

  void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); }

  bool TracksAllocationSizes() const override {
    return wrapped_->TracksAllocationSizes();
  }

  bool AllocatesOpaqueHandle() const override {
    return wrapped_->AllocatesOpaqueHandle();
  }

  size_t RequestedSize(const void* ptr) const override {
    return wrapped_->RequestedSize(ptr);
  }

  size_t AllocatedSize(const void* ptr) const override {
    return wrapped_->AllocatedSize(ptr);
  }

  int64 AllocationId(const void* ptr) const override {
    return wrapped_->AllocationId(ptr);
  }

  size_t AllocatedSizeSlow(const void* ptr) const override {
    return wrapped_->AllocatedSizeSlow(ptr);
  }

 private:
  Allocator* const wrapped_;
};
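
// Example use of AllocatorWrapper (an illustrative sketch; LoggingAllocator is
// a hypothetical subclass, not part of TensorFlow):
//   class LoggingAllocator : public AllocatorWrapper {
//    public:
//     using AllocatorWrapper::AllocatorWrapper;
//     void* AllocateRaw(size_t alignment, size_t num_bytes) override {
//       VLOG(1) << "Allocating " << num_bytes << " bytes from "
//               << wrapped()->Name();
//       return AllocatorWrapper::AllocateRaw(alignment, num_bytes);
//     }
//   };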

// A tensorflow Op may need access to different kinds of memory that
// are not simply a function of the device to which the Op has been
// assigned.  For example, an Op executing on a GPU may still need
// to allocate CPU RAM for some purpose.  Internal to the tensorflow
// runtime we may choose to allocate CPU RAM from special regions
// that have been prepared for higher performance in some use
// contexts, e.g. doing DMA with particular devices.  For these
// reasons, the Device interface does not expose just one memory
// Allocator, but instead provides an accessor that takes a
// specification of the desired memory attributes in order to select
// an Allocator.
//
// Example use:
//  // Allocator for ordinary device memory:
//  Allocator* a = allocator(AllocatorAttributes());
// ...
//  // Allocator for CPU RAM, regardless of where Op is executing:
//  AllocatorAttributes attr;
//  attr.set_on_host(true);
//  Allocator* a = allocator(attr);
struct AllocatorAttributes {
  void set_on_host(bool v) { value |= (static_cast<int>(v)); }
  bool on_host() const { return value & 0x1; }
  void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); }
  bool nic_compatible() const { return value & (0x1 << 1); }
  void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); }
  bool gpu_compatible() const { return value & (0x1 << 2); }
  void Merge(AllocatorAttributes other) {
    value |= other.value;
    if (scope_id != other.scope_id) {
      CHECK(scope_id == 0 || other.scope_id == 0)
          << "At least one scope_id should be zero to merge "
             "AllocatorAttributes but found this.scope_id="
          << scope_id << " and other.scope_id=" << other.scope_id;
      scope_id = scope_id == 0 ? other.scope_id : scope_id;
    }
  }
  // Returns true if the fields set in *this are a subset of or equal to
  // those set in other.
  bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const {
    return (value | other.value) == other.value;
  }

  // NOTE: The upper 8 bits of the value are reserved for
  // device-specific uses.  Implementors of a device can interpret these
  // upper 8 bits in device-specific ways, and ops implemented for those
  // devices are responsible for setting those 8 bits appropriately.
  uint32 value = 0;
  // EXPERIMENTAL: If this is greater than zero, then allocation is delegated
  // to a named special-purpose allocator on the same device.
  int32 scope_id = 0;

  // Returns a human readable representation of this.
  std::string DebugString() const;
};
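
// Example of merging AllocatorAttributes (an illustrative sketch; it follows
// directly from the bit-OR semantics of Merge() above):
//   AllocatorAttributes host_attr;
//   host_attr.set_on_host(true);
//   AllocatorAttributes gpu_attr;
//   gpu_attr.set_gpu_compatible(true);
//   host_attr.Merge(gpu_attr);
//   // host_attr.on_host() and host_attr.gpu_compatible() are now both true,
//   // so gpu_attr.IsEqualOrLessRestrictiveThan(host_attr) returns true.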

// Returns a trivial implementation of Allocator, which is a process singleton.
// Access through this function is only intended for use by restricted parts
// of the infrastructure.
Allocator* cpu_allocator_base();

// If available, calls ProcessState::GetCPUAllocator(numa_node).
// If not, falls back to cpu_allocator_base().
// Intended for use in contexts where ProcessState is not visible at
// compile time. Where ProcessState is visible, it's preferable to
// call it directly.
Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);

// Enables AllocatorStats in the default CPU allocator implementation.  By
// default, stats collection is disabled.
void EnableCPUAllocatorStats();
// Disables AllocatorStats in the default CPU allocator implementation.
void DisableCPUAllocatorStats();
bool CPUAllocatorStatsEnabled();

// Enables full statistics collection in the default CPU allocator
// implementation.  By default, it's disabled.
void EnableCPUAllocatorFullStats();
bool CPUAllocatorFullStatsEnabled();
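
// Example of collecting CPU allocator stats (an illustrative sketch; whether
// any stats are reported depends on the underlying allocator implementation):
//   EnableCPUAllocatorStats();
//   Allocator* a = cpu_allocator();
//   void* p = a->AllocateRaw(Allocator::kAllocatorAlignment, 1 << 20);
//   if (absl::optional<AllocatorStats> stats = a->GetStats()) {
//     LOG(INFO) << stats->DebugString();
//   }
//   a->DeallocateRaw(p);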

// An object that does the underlying suballoc/free of memory for a
// higher-level allocator.  The expectation is that the higher-level allocator
// is doing some kind of cache or pool management so that it will call
// SubAllocator::Alloc and Free relatively infrequently, compared to the number
// of times its own AllocateRaw and Free methods are called.
class SubAllocator {
 public:
  // Visitor gets called with a pointer to a memory area and its
  // size in bytes.  The index value will be numa_node for a CPU
  // allocator and GPU id for a GPU allocator.
  typedef std::function<void(void*, int index, size_t)> Visitor;

  SubAllocator(const std::vector<Visitor>& alloc_visitors,
               const std::vector<Visitor>& free_visitors);

  virtual ~SubAllocator() {}
  // Allocates at least num_bytes. Returns the actual number of bytes allocated
  // in bytes_received. The caller can safely use the full bytes_received-sized
  // buffer following the returned pointer.
  virtual void* Alloc(size_t alignment, size_t num_bytes,
                      size_t* bytes_received) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;

  // Returns true if the BFC allocator can safely coalesce adjacent regions
  // returned by this allocator.
  virtual bool SupportsCoalescing() const = 0;

 protected:
  // Implementations of the Alloc() method must call this on the newly
  // allocated value.
  void VisitAlloc(void* ptr, int index, size_t num_bytes);

  // Implementations of the Free() method must call this on the value to be
  // freed immediately before deallocation.
  void VisitFree(void* ptr, int index, size_t num_bytes);

  const std::vector<Visitor> alloc_visitors_;
  const std::vector<Visitor> free_visitors_;
};
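
// Example of a SubAllocator implementation (an illustrative sketch;
// BasicCPUSubAllocator is hypothetical and assumes port::AlignedMalloc /
// port::AlignedFree from tensorflow/core/platform/mem.h are available):
//   class BasicCPUSubAllocator : public SubAllocator {
//    public:
//     BasicCPUSubAllocator() : SubAllocator({}, {}) {}
//     void* Alloc(size_t alignment, size_t num_bytes,
//                 size_t* bytes_received) override {
//       *bytes_received = num_bytes;
//       void* ptr = port::AlignedMalloc(num_bytes, alignment);
//       VisitAlloc(ptr, /*index=*/0, num_bytes);
//       return ptr;
//     }
//     void Free(void* ptr, size_t num_bytes) override {
//       VisitFree(ptr, /*index=*/0, num_bytes);
//       port::AlignedFree(ptr);
//     }
//     bool SupportsCoalescing() const override { return false; }
//   };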

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_H_