/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/framework/allocator.h"

#include <atomic>

#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

string AllocatorStats::DebugString() const {
  return strings::Printf(
      "Limit:        %20lld\n"
      "InUse:        %20lld\n"
      "MaxInUse:     %20lld\n"
      "NumAllocs:    %20lld\n"
      "MaxAllocSize: %20lld\n",
      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
}

constexpr size_t Allocator::kAllocatorAlignment;

Allocator::~Allocator() {}

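// Element-wise construction/destruction helpers for buffers of non-trivial
// element types. Tensor buffers are allocated as raw bytes, so ResourceHandle
// elements must be constructed in place (placement new) and destroyed
// explicitly rather than via delete[].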
void RunResourceCtor(ResourceHandle* p, size_t n) {
  for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle();
}

void RunResourceDtor(ResourceHandle* p, size_t n) {
  for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle();
}

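// The Variant equivalents are defined out of line here rather than in
// allocator.h, presumably to keep the heavyweight variant.h include out of
// that widely included header.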
void Allocator::RunVariantCtor(Variant* p, size_t n) {
  for (size_t i = 0; i < n; ++p, ++i) new (p) Variant();
}

void Allocator::RunVariantDtor(Variant* p, size_t n) {
  for (size_t i = 0; i < n; ++p, ++i) p->~Variant();
}

// If true, cpu allocator collects more stats.
static bool cpu_allocator_collect_stats = false;
// If true, cpu allocator collects full stats.
static bool cpu_allocator_collect_full_stats = false;

// Individual allocations larger than this fraction of available RAM will
// trigger a warning.
static const double kLargeAllocationWarningThreshold = 0.1;

// If cpu_allocator_collect_stats is true, warn when the total allocated
// memory exceeds this fraction of available RAM.
static const double kTotalAllocationWarningThreshold = 0.5;

static const int kMaxSingleAllocationWarnings = 5;
static const int kMaxTotalAllocationWarnings = 1;
// Cache the result of the first call to port::AvailableRam, as it can be
// expensive.
static int64_t LargeAllocationWarningBytes() {
  static int64_t value = static_cast<int64_t>(
      port::AvailableRam() * kLargeAllocationWarningThreshold);
  return value;
}
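
// Note: this helper and TotalAllocationWarningBytes below latch their values
// on first use; the function-local statics are initialized exactly once
// (thread-safely under C++11), so later changes in available RAM are not
// observed.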

static int64_t TotalAllocationWarningBytes() {
  static int64_t value = static_cast<int64_t>(
      port::AvailableRam() * kTotalAllocationWarningThreshold);
  return value;
}

void EnableCPUAllocatorStats(bool enable) {
  cpu_allocator_collect_stats = enable;
}
bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; }
void EnableCPUAllocatorFullStats(bool enable) {
  cpu_allocator_collect_full_stats = enable;
}
bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; }
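
// Illustrative usage (a minimal sketch, not part of this file's API): stats
// must be enabled before allocations occur for the counters to be meaningful.
//
//   EnableCPUAllocatorStats(true);
//   Allocator* a = cpu_allocator();  // assumes the default-argument overload
//                                    // declared in allocator.h
//   void* p = a->AllocateRaw(Allocator::kAllocatorAlignment, 1 << 20);
//   LOG(INFO) << a->GetStats()->DebugString();
//   a->DeallocateRaw(p);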

namespace {
// A default Allocator for CPU devices.  ProcessState::GetCPUAllocator() will
// return a different version that may perform better, but may also lack the
// optional stats triggered by the functions above.  TODO(tucker): migrate all
// uses of cpu_allocator() except tests to use ProcessState instead.
class CPUAllocator : public Allocator {
 public:
  CPUAllocator()
      : single_allocation_warning_count_(0),
        total_allocation_warning_count_(0) {}

  ~CPUAllocator() override {}

  string Name() override { return "cpu"; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    if (num_bytes > LargeAllocationWarningBytes() &&
        single_allocation_warning_count_ < kMaxSingleAllocationWarnings) {
      ++single_allocation_warning_count_;
      LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
                   << 100 * kLargeAllocationWarningThreshold
                   << "% of system memory.";
    }

    void* p = port::AlignedMalloc(num_bytes, alignment);
    if (cpu_allocator_collect_stats) {
      const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
      mutex_lock l(mu_);
      ++stats_.num_allocs;
      stats_.bytes_in_use += alloc_size;
      stats_.peak_bytes_in_use =
          std::max<int64>(stats_.peak_bytes_in_use, stats_.bytes_in_use);
      stats_.largest_alloc_size =
          std::max<int64>(stats_.largest_alloc_size, alloc_size);

      if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
          total_allocation_warning_count_ < kMaxTotalAllocationWarnings) {
        ++total_allocation_warning_count_;
        LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use
                     << " exceeds " << 100 * kTotalAllocationWarningThreshold
                     << "% of system memory.";
      }
    }
    return p;
  }

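  // The statistics balance only because DeallocateRaw subtracts the same
  // MallocExtension-reported size that AllocateRaw added; if the extension is
  // unavailable (typically reporting 0), bytes_in_use simply stays flat.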
  void DeallocateRaw(void* ptr) override {
    if (cpu_allocator_collect_stats) {
      const std::size_t alloc_size =
          port::MallocExtension_GetAllocatedSize(ptr);
      mutex_lock l(mu_);
      stats_.bytes_in_use -= alloc_size;
    }
    port::AlignedFree(ptr);
  }

  absl::optional<AllocatorStats> GetStats() override {
    mutex_lock l(mu_);
    return stats_;
  }

  void ClearStats() override {
    mutex_lock l(mu_);
    // Reset the counters, but keep bytes_in_use (it reflects live
    // allocations) and fold it back in as the new peak.
    stats_.num_allocs = 0;
    stats_.peak_bytes_in_use = stats_.bytes_in_use;
    stats_.largest_alloc_size = 0;
  }

  size_t AllocatedSizeSlow(const void* ptr) override {
    return port::MallocExtension_GetAllocatedSize(ptr);
  }

 private:
  mutex mu_;
  AllocatorStats stats_ GUARDED_BY(mu_);

  // Use <atomic> for single allocations to avoid mutex contention when
  // statistics are disabled.
  std::atomic<int> single_allocation_warning_count_;
  int total_allocation_warning_count_ GUARDED_BY(mu_);

  TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
};

class CPUAllocatorFactory : public AllocatorFactory {
 public:
  Allocator* CreateAllocator() override { return new CPUAllocator; }

  SubAllocator* CreateSubAllocator(int numa_node) override {
    return new CPUSubAllocator(new CPUAllocator);
  }

 private:
  class CPUSubAllocator : public SubAllocator {
   public:
    explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
        : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {}

    void* Alloc(size_t alignment, size_t num_bytes) override {
      return cpu_allocator_->AllocateRaw(alignment, num_bytes);
    }

    void Free(void* ptr, size_t num_bytes) override {
      cpu_allocator_->DeallocateRaw(ptr);
    }

   private:
    CPUAllocator* cpu_allocator_;
  };
};

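// The priority 100 used below is the default tier; a factory registered with
// a higher priority takes precedence when the registry picks an allocator
// (see AllocatorFactoryRegistry in allocator_registry.h).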
REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocatorFactory);
}  // namespace

Allocator* cpu_allocator_base() {
  static Allocator* cpu_alloc =
      AllocatorFactoryRegistry::singleton()->GetAllocator();
  // TODO(tucker): This really seems wrong.  It's only going to be effective on
  // the first call in a process (but the desired effect is associated with a
  // session), and we probably ought to be tracking the highest level
  // Allocator, not the lowest.  Revisit the advertised semantics of the
  // triggering option.
  if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
    cpu_alloc = new TrackingAllocator(cpu_alloc, true);
  }
  return cpu_alloc;
}

Allocator* cpu_allocator(int numa_node) {
  // Correctness relies on devices (if any are created in this process) being
  // created before the first call to cpu_allocator: device creation triggers
  // ProcessState creation, which makes the correct accessor available via
  // process_state() below.
  static ProcessStateInterface* ps =
      AllocatorFactoryRegistry::singleton()->process_state();
  if (ps) {
    return ps->GetCPUAllocator(numa_node);
  } else {
    return cpu_allocator_base();
  }
}

SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
                           const std::vector<Visitor>& free_visitors)
    : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}

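// Visitors are callbacks supplied by higher layers (e.g. device code that
// needs to pin or register memory regions); they are invoked on every
// underlying allocation and free.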
void SubAllocator::VisitAlloc(void* ptr, int index, size_t num_bytes) {
  for (const auto& v : alloc_visitors_) {
    v(ptr, index, num_bytes);
  }
}

void SubAllocator::VisitFree(void* ptr, int index, size_t num_bytes) {
  // Although we don't guarantee any order of visitor application, strive
  // to apply free visitors in reverse order of alloc visitors.
  for (int i = free_visitors_.size() - 1; i >= 0; --i) {
    free_visitors_[i](ptr, index, num_bytes);
  }
}
}  // namespace tensorflow