/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"

#include <cstring>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/common_runtime/device/device_host_allocator.h"
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "cuda_malloc") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMemoryGuardAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "memory_guard") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAsyncAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  auto result = allocator_env != nullptr &&
                std::strcmp(allocator_env, "cuda_malloc_async") == 0;
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  return result;
#else
  if (result)
    LOG(ERROR) << "TF_GPU_ALLOCATOR=cuda_malloc_async environment variable is "
               << "set, but TensorFlow was not compiled with CUDA 11.2+.";
  return false;
#endif
}
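
// All three helpers above key off the single TF_GPU_ALLOCATOR environment
// variable, which GetGPUAllocator() consults below:
//   "memory_guard"      - wraps the BFC allocator with GPUDebugAllocator and
//                         GPUNanResetAllocator to catch overwrites and NaNs.
//   "cuda_malloc"       - bypasses BFC and forwards requests to cudaMalloc.
//   "cuda_malloc_async" - uses the CUDA 11.2+ stream-ordered allocator.
// Illustrative (hypothetical) shell usage; the script name is a placeholder:
//   TF_GPU_ALLOCATOR=cuda_malloc_async python train.py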

/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
  DCHECK((!ps) || (ps == instance))
      << "Multiple calls to GPUProcessState with non-null ps";
  return instance;
}

GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
  process_state_ = ProcessState::singleton();
}

int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
  // Return the NUMA node associated with the GPU's StreamExecutor.
  se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
                               DEVICE_GPU, GPUMachineManager(), tf_device_id)
                               .ValueOrDie();
  int numa_node = se->GetDeviceDescription().numa_node();
  // bus_id must be non-negative.  If the numa_node is not known,
  // use 0.
  return numa_node >= 0 ? numa_node : 0;
}

// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
static SubAllocator* CreateSubAllocator(
    const GPUOptions& options, PlatformDeviceId platform_device_id,
    const std::vector<SubAllocator::Visitor>& alloc_visitors,
    size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                            platform_device_id)
                      .ValueOrDie();

  // FIXME(imintz): Observed OOM issues when using the virtual memory
  // allocators. This should be re-enabled when resolved.
#if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
  // Use the old allocator when unified memory is required.
  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
  if (options.per_process_gpu_memory_fraction() > 1.0 ||
      options.experimental().use_unified_memory()) {
    return new DeviceMemAllocator(executor, platform_device_id,
                                  /*use_unified_memory=*/true, alloc_visitors,
                                  {});
  } else {
    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
        executor->implementation()->GpuContextHack());

    absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
    for (const TfDeviceId tf_device_id : peer_gpu_ids) {
      PlatformDeviceId platform_device_id;
      TF_CHECK_OK(
          GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
      platform_peer_gpu_ids.insert(platform_device_id);
    }
    std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());

    // Adjust virtual address space to be slightly larger than the physical
    // address space in case the BFC allocator performs suboptimal garbage
    // collection.
    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
    // the va space.
    return GpuVirtualMemAllocator::Create(
               alloc_visitors, {}, *gpu_context, platform_device_id,
               /*virtual_address_space_size=*/total_bytes * 2,
               platform_peer_gpu_ids_vec)
        .ValueOrDie()
        .release();
  }
#else
  return new DeviceMemAllocator(
      executor, platform_device_id,
      (options.per_process_gpu_memory_fraction() > 1.0 ||
       options.experimental().use_unified_memory()),
      alloc_visitors, {});
#endif
}

Allocator* GPUProcessState::GetGPUAllocator(
    const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
    const std::vector<TfDeviceId>& peer_gpu_ids) {
  CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  const string& allocator_type = options.allocator_type();
  mutex_lock lock(mu_);
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);

  if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    gpu_allocators_.resize(tf_device_id.value() + 1);
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.allocator == nullptr) {
    // Validate allocator types.
    if (!allocator_type.empty() && allocator_type != "BFC") {
      LOG(ERROR) << "Invalid allocator type: " << allocator_type;
      return nullptr;
    }

    PlatformDeviceId platform_device_id;
    TF_CHECK_OK(
        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
    int bus_id = BusIdForGPU(tf_device_id);
    DCHECK_GE(bus_id, 0);
    while (bus_id >= gpu_visitors_.size()) {
      gpu_visitors_.push_back({});
    }
    auto* sub_allocator =
        CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
                           total_bytes, peer_gpu_ids);
    GPUBFCAllocator* gpu_bfc_allocator = new GPUBFCAllocator(
        sub_allocator, total_bytes, options,
        strings::StrCat("GPU_", tf_device_id.value(), "_bfc"),
        options.experimental().internal_fragmentation_fraction());
    Allocator* gpu_allocator = gpu_bfc_allocator;

    SharedCounter* timing_counter = nullptr;
    if (options.experimental().timestamped_allocator()) {
      timing_counter = new SharedCounter;
      gpu_bfc_allocator->SetTimingCounter(timing_counter);
    }

    // The memory guard allocator checks for memory overwrites by writing
    // distinctive patterns on both ends of allocated memory.
    if (UseCudaMemoryGuardAllocator()) {
      LOG(INFO) << "Using memory guard allocator for GPU.";
      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_device_id);
      gpu_allocator =
          new GPUNanResetAllocator(gpu_allocator, platform_device_id);
    } else if (UseCudaMallocAllocator()) {
      LOG(INFO) << "Using CUDA malloc allocator for GPU.";
      // Passes all allocation requests through to cudaMalloc; useful for
      // memory debugging with tools like cuda-memcheck.
      // **WARNING** probably will not work in a multi-gpu scenario.
      delete gpu_bfc_allocator;
      delete sub_allocator;
      gpu_bfc_allocator = nullptr;
      sub_allocator = nullptr;
      gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
    } else if (UseCudaMallocAsyncAllocator()) {
      LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
                << platform_device_id;
      // Passes all allocation requests through to cudaMallocAsync; useful for
      // memory debugging with tools like compute-sanitizer.
      // **WARNING** probably will not work in a multi-gpu scenario.
      delete gpu_bfc_allocator;
      delete sub_allocator;
      gpu_bfc_allocator = nullptr;
      sub_allocator = nullptr;
      gpu_allocator =
          new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
    }

    Allocator* recording_allocator = nullptr;
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::GPU;
      md.dev_index = platform_device_id.value();
      md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
          &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
    }
    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
                       std::unique_ptr<SharedCounter>(timing_counter),
                       gpu_bfc_allocator, sub_allocator,
                       std::unique_ptr<Allocator>(recording_allocator)};
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return allocator_parts.recording_allocator.get();
  } else {
    return allocator_parts.allocator.get();
  }
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
                "--config=rocm.";
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
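
// Illustrative (hypothetical) call into the method above; the real callers
// live in the GPU device setup code:
//   Allocator* a = GPUProcessState::singleton()->GetGPUAllocator(
//       GPUOptions(), TfDeviceId(0), /*total_bytes=*/1ULL << 30,
//       /*peer_gpu_ids=*/{});
// Subsequent calls with the same TfDeviceId return the cached allocator.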

SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
  DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);
  mutex_lock l(mu_);
  if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
               << " but only have " << gpu_allocators_.size();
    return nullptr;
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.counter.get() == nullptr) {
    if (allocator_parts.bfc_allocator == nullptr) {
      return nullptr;
    }
    SharedCounter* timing_counter = new SharedCounter;
    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
    allocator_parts.counter.reset(timing_counter);
  }
  return allocator_parts.counter.get();
#else
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
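
// Illustrative (hypothetical) lookup of the timing counter managed above; the
// result is null until GetGPUAllocator has created a BFC allocator for the
// requested device:
//   SharedCounter* counter =
//       GPUProcessState::singleton()->GPUAllocatorCounter(TfDeviceId(0));
//   if (counter == nullptr) { /* no BFC allocator exists yet for GPU 0 */ }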

Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
  CHECK(process_state_);
  if (!HasGPUDevice() ||
      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
    return process_state_->GetCPUAllocator(numa_node);
  }
  if (numa_node == port::kNUMANoAffinity) {
    numa_node = 0;
  }
  {
    // Here we optimize the most common use case where gpu_host_allocators_
    // have already been populated and since we're only reading
    // these vectors, we can get by with a shared lock. In the slower case,
    // we take a unique lock and populate these vectors.
    tf_shared_lock lock(mu_);

    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
        !gpu_host_allocators_.empty() &&
        gpu_host_allocators_[0].recording_allocator != nullptr) {
      return gpu_host_allocators_[0].recording_allocator.get();
    }
    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
      return gpu_host_allocators_[0].allocator.get();
    }
  }

  mutex_lock lock(mu_);
  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
  // through, since any will work.
  //
  // This search isn't super clean, and it would be nice to use a
  // better source of information about which executor to use.  For
  // example, process_state could maybe save the first stream executor
  // it knows is valid.
  se::StreamExecutor* se = nullptr;
  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
    if (gpu_allocators_[i].allocator != nullptr) {
      se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                               TfDeviceId(i))
               .ValueOrDie();
      break;
    }
  }

  CHECK_NE(nullptr, se);

  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
    while (gpu_host_alloc_visitors_.size() <= numa_node) {
      gpu_host_alloc_visitors_.push_back({});
    }
    while (gpu_host_free_visitors_.size() <= numa_node) {
      gpu_host_free_visitors_.push_back({});
    }
    SubAllocator* sub_allocator = new DeviceHostAllocator(
        se, numa_node, gpu_host_alloc_visitors_[numa_node],
        gpu_host_free_visitors_[numa_node]);
    // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
    int64_t gpu_host_mem_limit_in_mb = -1;
    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                        1LL << 16 /*64GB max by default*/,
                                        &gpu_host_mem_limit_in_mb);
    if (!status.ok()) {
      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
    }
    int64_t gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);

    Allocator* allocator =
        new BFCAllocator(sub_allocator, gpu_host_mem_limit,
                         /*allow_growth=*/true, /*name=*/"gpu_host_bfc");

    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                    std::unique_ptr<SharedCounter>(nullptr),
                                    nullptr, sub_allocator,
                                    std::unique_ptr<Allocator>(nullptr)});
    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      allocator_parts.recording_allocator.reset(
          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
                                           allocator_parts.allocator.get(), md,
                                           &mu_));
    }
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return gpu_host_allocators_[0].recording_allocator.get();
  } else {
    return gpu_host_allocators_[0].allocator.get();
  }
}
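
// The pinned host ("gpu_host_bfc") pool created above is capped by the
// TF_GPU_HOST_MEM_LIMIT_IN_MB environment variable, which defaults to
// 1 << 16 MB (64GB). Illustrative (hypothetical) override from a shell:
//   TF_GPU_HOST_MEM_LIMIT_IN_MB=8192 python train.py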

void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                         const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_allocators_.empty())  // Crash OK
      << "AddGPUAllocVisitor must be called before "
         "first call to GetGPUAllocator.";
  DCHECK_GE(bus_id, 0);
  while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
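
// Illustrative (hypothetical) visitor registration, assuming the
// SubAllocator::Visitor callable takes (void* ptr, int index, size_t
// num_bytes). It must run before the first GetGPUAllocator call so the
// visitor is seen by CreateSubAllocator:
//   GPUProcessState::singleton()->AddGPUAllocVisitor(
//       /*bus_id=*/0, [](void* ptr, int index, size_t num_bytes) {
//         VLOG(1) << "GPU region at " << ptr << ", bytes=" << num_bytes;
//       });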

void GPUProcessState::AddGpuHostAllocVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostAllocVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_alloc_visitors_.size())) {
    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostFreeVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostFreeVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_free_visitors_.size())) {
    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_free_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::TestOnlyReset() {
  if (process_state_) {
    process_state_->ProcessState::TestOnlyReset();
  }
  {
    mutex_lock lock(mu_);
    gpu_device_enabled_ = false;
    gpu_allocators_.clear();
    gpu_visitors_.clear();
    gpu_host_allocators_.clear();
    gpu_host_alloc_visitors_.clear();
    gpu_host_free_visitors_.clear();
  }
}

}  // namespace tensorflow