1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // A simple CPU allocator that intercepts malloc/free calls from MKL library 17 // and redirects them to Tensorflow allocator 18 19 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ 20 #define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ 21 22 #ifdef INTEL_MKL 23 24 #include <cstdlib> 25 #include "tensorflow/core/common_runtime/bfc_allocator.h" 26 #include "tensorflow/core/common_runtime/pool_allocator.h" 27 #include "tensorflow/core/lib/strings/numbers.h" 28 #include "tensorflow/core/lib/strings/str_util.h" 29 #include "tensorflow/core/platform/mem.h" 30 #include "tensorflow/core/platform/numa.h" 31 32 #ifndef INTEL_MKL_DNN_ONLY 33 #include "i_malloc.h" 34 #endif 35 36 #ifdef _WIN32 37 typedef unsigned int uint; 38 #endif 39 40 namespace tensorflow { 41 42 static bool mkl_small_allocator_collect_stats = false; 43 44 class MklSubAllocator : public BasicCPUAllocator { 45 public: MklSubAllocator()46 MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {} ~MklSubAllocator()47 ~MklSubAllocator() override {} 48 }; 49 50 // CPU allocator that handles small-size allocations by calling 51 // suballocator directly. Mostly, it is just a wrapper around a suballocator 52 // (that calls malloc and free directly) with support for bookkeeping. 
class MklSmallSizeAllocator : public Allocator {
 public:
  // `sub_allocator` is retained for bookkeeping only and is not owned by this
  // class; the visible code never allocates through it (see AllocateRaw).
  // `total_memory` seeds stats_.bytes_limit; `name` is the reported Name().
  MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
                        const string& name)
      : sub_allocator_(sub_allocator), name_(name) {
    stats_.bytes_limit = total_memory;
  }
  ~MklSmallSizeAllocator() override {}

  TF_DISALLOW_COPY_AND_ASSIGN(MklSmallSizeAllocator);

  inline string Name() override { return name_; }

  // Allocates `num_bytes` aligned to `alignment` straight from the platform
  // aligned-malloc, bypassing sub_allocator_. Stats are updated only when the
  // global mkl_small_allocator_collect_stats flag is on.
  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    void* ptr = port::AlignedMalloc(num_bytes, alignment);
    if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes);
    return ptr;
  }

  // Frees a pointer previously returned by AllocateRaw. A nullptr is logged
  // and ignored rather than passed to AlignedFree.
  void DeallocateRaw(void* ptr) override {
    if (ptr == nullptr) {
      LOG(ERROR) << "tried to deallocate nullptr";
      return;
    }

    if (mkl_small_allocator_collect_stats) {
      // NOTE(review): the decrement uses the allocator-reported size, which
      // may differ from the num_bytes counted by IncrementStats, so
      // bytes_in_use can drift when the two disagree — confirm intended.
      const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
      DecrementStats(alloc_size);
    }
    port::AlignedFree(ptr);
  }

  // Returns a snapshot of the current stats under the lock.
  absl::optional<AllocatorStats> GetStats() override {
    mutex_lock l(mutex_);
    return stats_;
  }

  // Resets all counters. NOTE(review): this also zeroes bytes_limit, which
  // was set from total_memory in the constructor — verify callers do not rely
  // on the limit surviving a ClearStats().
  void ClearStats() override {
    mutex_lock l(mutex_);
    stats_.num_allocs = 0;
    stats_.peak_bytes_in_use = 0;
    stats_.largest_alloc_size = 0;
    stats_.bytes_in_use = 0;
    stats_.bytes_limit = 0;
  }

 private:
  // Increment statistics for the allocator handling small allocations:
  // bumps num_allocs and bytes_in_use, and refreshes the peak/largest highs.
  inline void IncrementStats(size_t alloc_size) TF_LOCKS_EXCLUDED(mutex_) {
    mutex_lock l(mutex_);
    ++stats_.num_allocs;
    stats_.bytes_in_use += alloc_size;
    stats_.peak_bytes_in_use =
        std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
    stats_.largest_alloc_size =
        std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size));
  }

  // Decrement statistics for the allocator handling small allocations.
  inline void DecrementStats(size_t dealloc_size) TF_LOCKS_EXCLUDED(mutex_) {
    mutex_lock l(mutex_);
    stats_.bytes_in_use -= dealloc_size;
  }

  SubAllocator* sub_allocator_;  // Not owned by this class.

  // Mutex for protecting updates to map of allocations.
  mutable mutex mutex_;

  // Allocator name
  string name_;

  // Allocator stats for small allocs
  AllocatorStats stats_ TF_GUARDED_BY(mutex_);
};

/// CPU allocator for MKL that wraps BFC allocator and intercepts
/// and redirects memory allocation calls from MKL.
class MklCPUAllocator : public Allocator {
 public:
  // Constructor and other standard functions

  /// Environment variable that user can set to upper bound on memory allocation
  static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES";

  /// Default upper limit on allocator size - 64GB
  static constexpr size_t kDefaultMaxLimit = 64LL << 30;

  // CHECK-fails if Initialize() cannot set up the sub-allocators.
  MklCPUAllocator() { TF_CHECK_OK(Initialize()); }

  ~MklCPUAllocator() override {
    delete small_size_allocator_;
    delete large_size_allocator_;
  }

  // Determines the memory limit, builds the small-size and BFC (large-size)
  // allocators, and (unless INTEL_MKL_DNN_ONLY) installs the MKL i_malloc
  // hooks so MKL-internal allocations are redirected here.
  Status Initialize() {
    VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";

    // Set upper bound on memory allocation to physical RAM available on the
    // CPU unless explicitly specified by user
    uint64 max_mem_bytes = kDefaultMaxLimit;
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
    max_mem_bytes =
        (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE);
#endif
    char* user_mem_bytes = getenv(kMaxLimitStr);

    if (user_mem_bytes != NULL) {
      uint64 user_val = 0;
      // Reject unparseable values; a parseable value always wins, even when
      // it exceeds physical memory (we only warn in that case below).
      if (!strings::safe_strtou64(user_mem_bytes, &user_val)) {
        return errors::InvalidArgument("Invalid memory limit (", user_mem_bytes,
                                       ") specified for MKL allocator through ",
                                       kMaxLimitStr);
      }
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
      if (user_val > max_mem_bytes) {
        LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
                     << "=" << user_val
                     << " greater than available physical memory: "
                     << max_mem_bytes
                     << ". This could significantly reduce performance!";
      }
#endif
      max_mem_bytes = user_val;
    }

    VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes;

    sub_allocator_ = new MklSubAllocator();

    // SubAllocator is owned by BFCAllocator, so we do not need to deallocate
    // it in MklSmallSizeAllocator.
    small_size_allocator_ =
        new MklSmallSizeAllocator(sub_allocator_, max_mem_bytes, kName);
    large_size_allocator_ =
        new BFCAllocator(sub_allocator_, max_mem_bytes, kAllowGrowth, kName);
#ifndef INTEL_MKL_DNN_ONLY
    // For redirecting all allocations from MKL to this allocator
    // From: http://software.intel.com/en-us/node/528565
    i_malloc = MallocHook;
    i_calloc = CallocHook;
    i_realloc = ReallocHook;
    i_free = FreeHook;
#endif
    return Status::OK();
  }

  inline string Name() override { return kName; }

  // A pointer is "small" iff it is absent from large_allocations_map_;
  // consequently an unknown (or null) pointer is classified as small.
  inline bool IsSmallSizeAllocation(const void* ptr) const
      TF_LOCKS_EXCLUDED(mutex_) {
    mutex_lock l(mutex_);
    return large_allocations_map_.find(ptr) == large_allocations_map_.end();
  }
  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    if (ptr != nullptr) {
      std::pair<void*, size_t> map_val(ptr, num_bytes);
      large_allocations_map_.insert(map_val);
    }
  }
  inline void RemoveLargeAllocMap(void* ptr)
      TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    auto map_iter = large_allocations_map_.find(ptr);
    if (map_iter != large_allocations_map_.end()) {
      large_allocations_map_.erase(map_iter);
    } else {
      LOG(ERROR) << "tried to deallocate invalid pointer";
    }
    return;
  }

  inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    // If the allocation size is less than threshold, call small allocator,
    // otherwise call large-size allocator (BFC). We found that BFC allocator
    // does not deliver good performance for small allocations when
    // inter_op_parallelism_threads is high.
    if (num_bytes < kSmallAllocationsThreshold) {
      return small_size_allocator_->AllocateRaw(alignment, num_bytes);
    } else {
      // Lock covers both the BFC allocation and the map insert so the
      // pointer is registered before any concurrent lookup can see it.
      mutex_lock l(mutex_);
      void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes);
      AddLargeAllocMap(ptr, num_bytes);
      return ptr;
    }
  }

  inline void DeallocateRaw(void* ptr) override {
    // Check if ptr is for "small" allocation. If it is, then call Free
    // directly. Otherwise, call BFC to handle free.
    // NOTE(review): a nullptr is never in the map, so it is routed to the
    // small allocator, which logs an error and returns — confirm intended.
    if (IsSmallSizeAllocation(ptr)) {
      small_size_allocator_->DeallocateRaw(ptr);
    } else {
      mutex_lock l(mutex_);
      RemoveLargeAllocMap(ptr);
      large_size_allocator_->DeallocateRaw(ptr);
    }
  }

  // Merges the stats of the small- and large-size allocators into stats_.
  absl::optional<AllocatorStats> GetStats() override {
    auto s_stats = small_size_allocator_->GetStats();
    auto l_stats = large_size_allocator_->GetStats();

    // Combine statistics from small-size and large-size allocator.
    // NOTE(review): both optionals are dereferenced without a has_value()
    // check — relies on both allocators always reporting stats.
    mutex_lock l(mutex_);
    stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs;
    stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use;
    stats_.peak_bytes_in_use =
        l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use;

    // Since small-size allocations go to MklSmallSizeAllocator,
    // max_alloc_size from large_size_allocator would be the maximum
    // size allocated by MklCPUAllocator.
    stats_.largest_alloc_size = l_stats->largest_alloc_size;
    stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit);
    return stats_;
  }

  // Clears both sub-allocators' stats (stats_ itself is refreshed lazily by
  // the next GetStats call).
  void ClearStats() override {
    small_size_allocator_->ClearStats();
    large_size_allocator_->ClearStats();
  }

 private:
  // Hooks provided by this allocator for memory allocation routines from MKL

  static inline void* MallocHook(size_t size) {
    VLOG(3) << "MklCPUAllocator: In MallocHook";
    return cpu_allocator()->AllocateRaw(kAlignment, size);
  }

  static inline void FreeHook(void* ptr) {
    VLOG(3) << "MklCPUAllocator: In FreeHook";
    cpu_allocator()->DeallocateRaw(ptr);
  }

  // calloc/realloc redirection is not supported; these hooks CHECK-fail.
  static inline void* CallocHook(size_t num, size_t size) {
    Status s = Status(error::Code::UNIMPLEMENTED,
                      "Unimplemented case for hooking MKL function.");
    TF_CHECK_OK(s);  // way to assert with an error message
    return nullptr;  // return a value and make static code analyzers happy
  }

  static inline void* ReallocHook(void* ptr, size_t size) {
    Status s = Status(error::Code::UNIMPLEMENTED,
                      "Unimplemented case for hooking MKL function.");
    TF_CHECK_OK(s);  // way to assert with an error message
    return nullptr;  // return a value and make static code analyzers happy
  }

  // Do we allow growth in BFC Allocator
  static const bool kAllowGrowth = true;

  // Name
  static constexpr const char* kName = "mklcpu";

  // The alignment that we need for the allocations
  static constexpr const size_t kAlignment = 64;

  Allocator* large_size_allocator_;              // owned by this class
  MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.

  SubAllocator* sub_allocator_;  // not owned by this class
  mutable mutex mutex_;
  AllocatorStats stats_ TF_GUARDED_BY(mutex_);

  // Hash map to keep track of "BFC" allocations
  // We do not use BFC allocator for small allocations.
  std::unordered_map<const void*, size_t> large_allocations_map_
      TF_GUARDED_BY(mutex_);

  // Size in bytes that defines the upper-bound for "small" allocations.
  // Any allocation below this threshold is "small" allocation.
  static constexpr const size_t kSmallAllocationsThreshold = 4096;

  // Prevent copying and assignment
  TF_DISALLOW_COPY_AND_ASSIGN(MklCPUAllocator);
};

}  // namespace tensorflow

#endif  // INTEL_MKL

#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_