/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// A simple CPU allocator that intercepts malloc/free calls from MKL library
// and redirects them to Tensorflow allocator

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_

#ifdef INTEL_MKL

#include <cstdlib>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/numa.h"

#ifndef INTEL_MKL_DNN_ONLY
// Intel MKL hook points (i_malloc/i_calloc/i_realloc/i_free) used below to
// redirect MKL-internal allocations into this allocator.
#include "i_malloc.h"
#endif

#ifdef _WIN32
// MSVC does not provide the POSIX `uint` typedef used by some MKL headers.
typedef unsigned int uint;
#endif

namespace tensorflow {

// When true, the small-size allocator keeps AllocatorStats bookkeeping on
// every alloc/free (off by default to avoid the locking overhead).
// NOTE(review): `static` at namespace scope in a header gives every
// translation unit its own copy of this flag — toggling it in one TU does not
// affect others; confirm this is the intended scope.
static bool mkl_small_allocator_collect_stats = false;

// Thin SubAllocator over the basic CPU allocator with no NUMA affinity and no
// alloc/free visitors; handed to both the BFC and small-size allocators below.
class MklSubAllocator : public BasicCPUAllocator {
 public:
  MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
  ~MklSubAllocator() override {}
};

// CPU allocator that handles small-size allocations by calling
// suballocator directly. Mostly, it is just a wrapper around a suballocator
// (that calls malloc and free directly) with support for bookkeeping.
53 class MklSmallSizeAllocator : public Allocator { 54 public: MklSmallSizeAllocator(SubAllocator * sub_allocator,size_t total_memory,const string & name)55 MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory, 56 const string& name) 57 : sub_allocator_(sub_allocator), name_(name) { 58 stats_.bytes_limit = total_memory; 59 } ~MklSmallSizeAllocator()60 ~MklSmallSizeAllocator() override {} 61 62 TF_DISALLOW_COPY_AND_ASSIGN(MklSmallSizeAllocator); 63 Name()64 inline string Name() override { return name_; } 65 AllocateRaw(size_t alignment,size_t num_bytes)66 void* AllocateRaw(size_t alignment, size_t num_bytes) override { 67 void* ptr = port::AlignedMalloc(num_bytes, alignment); 68 if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes); 69 return ptr; 70 } 71 DeallocateRaw(void * ptr)72 void DeallocateRaw(void* ptr) override { 73 if (ptr == nullptr) { 74 LOG(ERROR) << "tried to deallocate nullptr"; 75 return; 76 } 77 78 if (mkl_small_allocator_collect_stats) { 79 const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr); 80 DecrementStats(alloc_size); 81 } 82 port::AlignedFree(ptr); 83 } 84 GetStats()85 absl::optional<AllocatorStats> GetStats() override { 86 mutex_lock l(mutex_); 87 return stats_; 88 } 89 ClearStats()90 void ClearStats() override { 91 mutex_lock l(mutex_); 92 stats_.num_allocs = 0; 93 stats_.peak_bytes_in_use = 0; 94 stats_.largest_alloc_size = 0; 95 stats_.bytes_in_use = 0; 96 stats_.bytes_limit = 0; 97 } 98 99 private: 100 // Increment statistics for the allocator handling small allocations. 
IncrementStats(size_t alloc_size)101 inline void IncrementStats(size_t alloc_size) LOCKS_EXCLUDED(mutex_) { 102 mutex_lock l(mutex_); 103 ++stats_.num_allocs; 104 stats_.bytes_in_use += alloc_size; 105 stats_.peak_bytes_in_use = 106 std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); 107 stats_.largest_alloc_size = 108 std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size)); 109 } 110 111 // Decrement statistics for the allocator handling small allocations. DecrementStats(size_t dealloc_size)112 inline void DecrementStats(size_t dealloc_size) LOCKS_EXCLUDED(mutex_) { 113 mutex_lock l(mutex_); 114 stats_.bytes_in_use -= dealloc_size; 115 } 116 117 SubAllocator* sub_allocator_; // Not owned by this class. 118 119 // Mutex for protecting updates to map of allocations. 120 mutable mutex mutex_; 121 122 // Allocator name 123 string name_; 124 125 // Allocator stats for small allocs 126 AllocatorStats stats_ GUARDED_BY(mutex_); 127 }; 128 129 /// CPU allocator for MKL that wraps BFC allocator and intercepts 130 /// and redirects memory allocation calls from MKL. 
131 class MklCPUAllocator : public Allocator { 132 public: 133 // Constructor and other standard functions 134 135 /// Environment variable that user can set to upper bound on memory allocation 136 static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES"; 137 138 /// Default upper limit on allocator size - 64GB 139 static constexpr size_t kDefaultMaxLimit = 64LL << 30; 140 MklCPUAllocator()141 MklCPUAllocator() { TF_CHECK_OK(Initialize()); } 142 ~MklCPUAllocator()143 ~MklCPUAllocator() override { 144 delete small_size_allocator_; 145 delete large_size_allocator_; 146 } 147 Initialize()148 Status Initialize() { 149 VLOG(2) << "MklCPUAllocator: In MklCPUAllocator"; 150 151 // Set upper bound on memory allocation to physical RAM available on the 152 // CPU unless explicitly specified by user 153 uint64 max_mem_bytes = kDefaultMaxLimit; 154 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) 155 max_mem_bytes = 156 (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE); 157 #endif 158 char* user_mem_bytes = getenv(kMaxLimitStr); 159 160 if (user_mem_bytes != NULL) { 161 uint64 user_val = 0; 162 if (!strings::safe_strtou64(user_mem_bytes, &user_val)) { 163 return errors::InvalidArgument("Invalid memory limit (", user_mem_bytes, 164 ") specified for MKL allocator through ", 165 kMaxLimitStr); 166 } 167 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) 168 if (user_val > max_mem_bytes) { 169 LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr 170 << "=" << user_val 171 << " greater than available physical memory: " 172 << max_mem_bytes 173 << ". This could significantly reduce performance!"; 174 } 175 #endif 176 max_mem_bytes = user_val; 177 } 178 179 VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes; 180 181 sub_allocator_ = new MklSubAllocator(); 182 183 // SubAllocator is owned by BFCAllocator, so we do not need to deallocate 184 // it in MklSmallSizeAllocator. 
185 small_size_allocator_ = 186 new MklSmallSizeAllocator(sub_allocator_, max_mem_bytes, kName); 187 large_size_allocator_ = 188 new BFCAllocator(sub_allocator_, max_mem_bytes, kAllowGrowth, kName); 189 #ifndef INTEL_MKL_DNN_ONLY 190 // For redirecting all allocations from MKL to this allocator 191 // From: http://software.intel.com/en-us/node/528565 192 i_malloc = MallocHook; 193 i_calloc = CallocHook; 194 i_realloc = ReallocHook; 195 i_free = FreeHook; 196 #endif 197 return Status::OK(); 198 } 199 Name()200 inline string Name() override { return kName; } IsSmallSizeAllocation(const void * ptr)201 inline bool IsSmallSizeAllocation(const void* ptr) const 202 LOCKS_EXCLUDED(mutex_) { 203 mutex_lock l(mutex_); 204 return large_allocations_map_.find(ptr) == large_allocations_map_.end(); 205 } 206 // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held AddLargeAllocMap(void * ptr,size_t num_bytes)207 inline void AddLargeAllocMap(void* ptr, size_t num_bytes) 208 EXCLUSIVE_LOCKS_REQUIRED(mutex_) { 209 if (ptr != nullptr) { 210 std::pair<void*, size_t> map_val(ptr, num_bytes); 211 large_allocations_map_.insert(map_val); 212 } 213 } RemoveLargeAllocMap(void * ptr)214 inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) { 215 auto map_iter = large_allocations_map_.find(ptr); 216 if (map_iter != large_allocations_map_.end()) { 217 large_allocations_map_.erase(map_iter); 218 } else { 219 LOG(ERROR) << "tried to deallocate invalid pointer"; 220 } 221 return; 222 } 223 AllocateRaw(size_t alignment,size_t num_bytes)224 inline void* AllocateRaw(size_t alignment, size_t num_bytes) override { 225 // If the allocation size is less than threshold, call small allocator, 226 // otherwise call large-size allocator (BFC). We found that BFC allocator 227 // does not deliver good performance for small allocations when 228 // inter_op_parallelism_threads is high. 
229 if (num_bytes < kSmallAllocationsThreshold) { 230 return small_size_allocator_->AllocateRaw(alignment, num_bytes); 231 } else { 232 mutex_lock l(mutex_); 233 void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes); 234 AddLargeAllocMap(ptr, num_bytes); 235 return ptr; 236 } 237 } 238 DeallocateRaw(void * ptr)239 inline void DeallocateRaw(void* ptr) override { 240 // Check if ptr is for "small" allocation. If it is, then call Free 241 // directly. Otherwise, call BFC to handle free. 242 if (IsSmallSizeAllocation(ptr)) { 243 small_size_allocator_->DeallocateRaw(ptr); 244 } else { 245 mutex_lock l(mutex_); 246 RemoveLargeAllocMap(ptr); 247 large_size_allocator_->DeallocateRaw(ptr); 248 } 249 } 250 GetStats()251 absl::optional<AllocatorStats> GetStats() override { 252 auto s_stats = small_size_allocator_->GetStats(); 253 auto l_stats = large_size_allocator_->GetStats(); 254 255 // Combine statistics from small-size and large-size allocator. 256 stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs; 257 stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use; 258 stats_.peak_bytes_in_use = 259 l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use; 260 261 // Since small-size allocations go to MklSmallSizeAllocator, 262 // max_alloc_size from large_size_allocator would be the maximum 263 // size allocated by MklCPUAllocator. 
264 stats_.largest_alloc_size = l_stats->largest_alloc_size; 265 stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit); 266 return stats_; 267 } 268 ClearStats()269 void ClearStats() override { 270 small_size_allocator_->ClearStats(); 271 large_size_allocator_->ClearStats(); 272 } 273 274 private: 275 // Hooks provided by this allocator for memory allocation routines from MKL 276 MallocHook(size_t size)277 static inline void* MallocHook(size_t size) { 278 VLOG(3) << "MklCPUAllocator: In MallocHook"; 279 return cpu_allocator()->AllocateRaw(kAlignment, size); 280 } 281 FreeHook(void * ptr)282 static inline void FreeHook(void* ptr) { 283 VLOG(3) << "MklCPUAllocator: In FreeHook"; 284 cpu_allocator()->DeallocateRaw(ptr); 285 } 286 CallocHook(size_t num,size_t size)287 static inline void* CallocHook(size_t num, size_t size) { 288 Status s = Status(error::Code::UNIMPLEMENTED, 289 "Unimplemented case for hooking MKL function."); 290 TF_CHECK_OK(s); // way to assert with an error message 291 return nullptr; // return a value and make static code analyzers happy 292 } 293 ReallocHook(void * ptr,size_t size)294 static inline void* ReallocHook(void* ptr, size_t size) { 295 Status s = Status(error::Code::UNIMPLEMENTED, 296 "Unimplemented case for hooking MKL function."); 297 TF_CHECK_OK(s); // way to assert with an error message 298 return nullptr; // return a value and make static code analyzers happy 299 } 300 301 // Do we allow growth in BFC Allocator 302 static const bool kAllowGrowth = true; 303 304 // Name 305 static constexpr const char* kName = "mklcpu"; 306 307 // The alignment that we need for the allocations 308 static constexpr const size_t kAlignment = 64; 309 310 Allocator* large_size_allocator_; // owned by this class 311 MklSmallSizeAllocator* small_size_allocator_; // owned by this class. 
312 313 SubAllocator* sub_allocator_; // not owned by this class 314 mutable mutex mutex_; 315 AllocatorStats stats_ GUARDED_BY(mutex_); 316 317 // Hash map to keep track of "BFC" allocations 318 // We do not use BFC allocator for small allocations. 319 std::unordered_map<const void*, size_t> large_allocations_map_ 320 GUARDED_BY(mutex_); 321 322 // Size in bytes that defines the upper-bound for "small" allocations. 323 // Any allocation below this threshold is "small" allocation. 324 static constexpr const size_t kSmallAllocationsThreshold = 4096; 325 326 // Prevent copying and assignment 327 TF_DISALLOW_COPY_AND_ASSIGN(MklCPUAllocator); 328 }; 329 330 } // namespace tensorflow 331 332 #endif // INTEL_MKL 333 334 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ 335