/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15  
16  #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
17  
18  #include <cstring>
19  #include <vector>
20  
21  #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
22  #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
23  #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
24  #include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h"
25  #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
26  #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
27  #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
28  #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
29  #include "tensorflow/core/common_runtime/pool_allocator.h"
30  #include "tensorflow/core/common_runtime/shared_counter.h"
31  #include "tensorflow/core/framework/allocator.h"
32  #include "tensorflow/core/framework/log_memory.h"
33  #include "tensorflow/core/framework/tracking_allocator.h"
34  #include "tensorflow/core/lib/strings/strcat.h"
35  #include "tensorflow/core/platform/logging.h"
36  #include "tensorflow/core/platform/mutex.h"
37  #include "tensorflow/core/platform/stream_executor.h"
38  #include "tensorflow/core/platform/types.h"
39  #include "tensorflow/core/util/env_var.h"
40  
41  namespace tensorflow {
42  namespace {
43  
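// Helpers that select an alternative GPU allocator via the TF_GPU_ALLOCATOR
// environment variable, e.g. `TF_GPU_ALLOCATOR=cuda_malloc python train.py`
// (illustrative invocation).  Unset or unrecognized values leave the default
// BFC allocator in place.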
bool useCudaMallocAllocator() {
  const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
  return debug_allocator_str != nullptr &&
         std::strcmp(debug_allocator_str, "cuda_malloc") == 0;
}

bool useCudaMemoryGuardAllocator() {
  const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
  return debug_allocator_str != nullptr &&
         std::strcmp(debug_allocator_str, "memory_guard") == 0;
}

}  // namespace

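// Returns the process-wide GPUProcessState, constructing it on first use.  A
// non-null `ps` may be passed (e.g. by a test) to install a specific
// instance; any later call must then pass null or that same pointer.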
/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
  DCHECK((!ps) || (ps == instance))
      << "Multiple calls to GPUProcessState with non-null ps";
  return instance;
}

GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
  process_state_ = ProcessState::singleton();
}

int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
  // Return the NUMA node associated with the GPU's StreamExecutor.
  se::StreamExecutor* se =
      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
  int numa_node = se->GetDeviceDescription().numa_node();
  // bus_id must be non-negative.  If the numa_node is not known, use 0.
  return numa_node >= 0 ? numa_node : 0;
}

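// Returns the allocator for GPU `tf_gpu_id`, creating it on first use: a
// GPUBFCAllocator over a GPUMemAllocator sub-allocator, optionally wrapped
// by debug or recording allocators.  `total_bytes` caps the memory the
// allocator may manage and is only consulted on the first call for a given
// GPU.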
Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
                                            TfGpuId tf_gpu_id,
                                            size_t total_bytes) {
  CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  const string& allocator_type = options.allocator_type();
  mutex_lock lock(mu_);
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);

  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    gpu_allocators_.resize(tf_gpu_id.value() + 1);
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
  if (allocator_parts.allocator == nullptr) {
    // Validate allocator types.
    if (!allocator_type.empty() && allocator_type != "BFC") {
      LOG(ERROR) << "Invalid allocator type: " << allocator_type;
      return nullptr;
    }

    PlatformGpuId platform_gpu_id;
    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
    int bus_id = BusIdForGPU(tf_gpu_id);
    DCHECK_GE(bus_id, 0);
    while (bus_id >= gpu_visitors_.size()) {
      gpu_visitors_.push_back({});
    }
    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
        GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
        platform_gpu_id,
        (options.per_process_gpu_memory_fraction() > 1.0 ||
         options.experimental().use_unified_memory()),
        gpu_visitors_[bus_id], {});
    GPUBFCAllocator* gpu_bfc_allocator =
        new GPUBFCAllocator(sub_allocator, total_bytes, options,
                            strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
    Allocator* gpu_allocator = gpu_bfc_allocator;
    SharedCounter* timing_counter = nullptr;
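    // When timestamped allocation is requested, attach a SharedCounter that
    // the BFC allocator uses to stamp its allocations and deallocations.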
    if (options.experimental().timestamped_allocator()) {
      timing_counter = new SharedCounter;
      gpu_bfc_allocator->SetTimingCounter(timing_counter);
    }

    if (useCudaMemoryGuardAllocator()) {
      // TF_GPU_ALLOCATOR=memory_guard: check for memory overwrites by
      // writing distinctive patterns on both ends of allocated memory.
      LOG(INFO) << "Using memory guard allocator for GPU.";
      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
    } else if (useCudaMallocAllocator()) {
      // TF_GPU_ALLOCATOR=cuda_malloc: pass all allocation requests straight
      // through to cudaMalloc.  Useful for memory debugging with tools like
      // cuda-memcheck.  **WARNING** probably will not work in a multi-GPU
      // scenario.
      LOG(INFO) << "Using CUDA malloc allocator for GPU.";
      gpu_allocator =
          new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
    }

    Allocator* recording_allocator = nullptr;
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::GPU;
      md.dev_index = platform_gpu_id.value();
      md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
          &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
    }
    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
                       std::unique_ptr<SharedCounter>(timing_counter),
                       gpu_bfc_allocator, sub_allocator,
                       std::unique_ptr<Allocator>(recording_allocator)};
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return allocator_parts.recording_allocator.get();
  } else {
    return allocator_parts.allocator.get();
  }
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
                "--config=rocm.";
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

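// Returns the SharedCounter used by GPU `tf_gpu_id`'s BFC allocator to
// timestamp its operations, creating one on demand.  Returns nullptr if the
// allocator itself has not been created yet.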
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
  DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
  mutex_lock l(mu_);
  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
               << " but only have " << gpu_allocators_.size();
    return nullptr;
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
  if (allocator_parts.counter.get() == nullptr) {
    SharedCounter* timing_counter = new SharedCounter;
    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
    allocator_parts.counter.reset(timing_counter);
  }
  return allocator_parts.counter.get();
#else
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

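// Returns an allocator for pinned host memory that GPUs can DMA to and from.
// Falls back to the plain CPU allocator when no GPU device is present or GPU
// DMA registration is disabled.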
Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
  CHECK(process_state_);
  if (!HasGPUDevice() ||
      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
    return process_state_->GetCPUAllocator(numa_node);
  }
  if (numa_node == port::kNUMANoAffinity) {
    numa_node = 0;
  }
  {
    // Optimize for the common case where gpu_host_allocators_ has already
    // been populated: since we only read the vectors here, a shared lock
    // suffices.  The slower path below takes a unique lock and populates
    // them.
    tf_shared_lock lock(mu_);

    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
        !gpu_host_allocators_.empty() &&
        gpu_host_allocators_[0].recording_allocator != nullptr) {
      return gpu_host_allocators_[0].recording_allocator.get();
    }
    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
      return gpu_host_allocators_[0].allocator.get();
    }
  }

  mutex_lock lock(mu_);
  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
  // through, since any will work.
  //
  // This search isn't super clean, and it would be nice to use a better
  // source of information about which executor to use.  For example,
  // process_state could maybe save the first stream executor it knows is
  // valid.
  se::StreamExecutor* se = nullptr;
  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
    if (gpu_allocators_[i].allocator != nullptr) {
      se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
      break;
    }
  }

  CHECK_NE(nullptr, se);

  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
    while (gpu_host_alloc_visitors_.size() <= numa_node) {
      gpu_host_alloc_visitors_.push_back({});
    }
    while (gpu_host_free_visitors_.size() <= numa_node) {
      gpu_host_free_visitors_.push_back({});
    }
    SubAllocator* sub_allocator =
        new GpuHostAllocator(se, numa_node, gpu_host_alloc_visitors_[numa_node],
                             gpu_host_free_visitors_[numa_node]);
    // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
    int64 gpu_host_mem_limit_in_mb = -1;
    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                        1LL << 16 /*64GB max by default*/,
                                        &gpu_host_mem_limit_in_mb);
    if (!status.ok()) {
      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
    }
    int64 gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);

    Allocator* allocator =
        new BFCAllocator(sub_allocator, gpu_host_mem_limit,
                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);

    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                    std::unique_ptr<SharedCounter>(nullptr),
                                    nullptr, sub_allocator,
                                    std::unique_ptr<Allocator>(nullptr)});
    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      allocator_parts.recording_allocator.reset(
          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
                                           allocator_parts.allocator.get(), md,
                                           &mu_));
    }
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return gpu_host_allocators_[0].recording_allocator.get();
  } else {
    return gpu_host_allocators_[0].allocator.get();
  }
}

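// Registers a visitor invoked on each region the GPU sub-allocator obtains
// for GPUs attached to bus `bus_id`.  Must be called before the first call
// to GetGPUAllocator.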
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                         const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_allocators_.empty())  // Crash OK
      << "AddGPUAllocVisitor must be called before "
         "first call to GetGPUAllocator.";
  DCHECK_GE(bus_id, 0);
  while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

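// Registers a visitor invoked on each allocation by the pinned-host-memory
// sub-allocator for `numa_node`.  Must be called before the first call to
// GetGpuHostAllocator.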
void GPUProcessState::AddGpuHostAllocVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostAllocVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_alloc_visitors_.size())) {
    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

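// Registers a visitor invoked on each free by the pinned-host-memory
// sub-allocator for `numa_node`.  Must be called before the first call to
// GetGpuHostAllocator.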
void GPUProcessState::AddGpuHostFreeVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostFreeVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_free_visitors_.size())) {
    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_free_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

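// Test-only helper that resets the process-wide GPU state: clears all cached
// allocators and registered visitors so a subsequent test starts fresh.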
void GPUProcessState::TestOnlyReset() {
  if (process_state_) {
    process_state_->ProcessState::TestOnlyReset();
  }
  {
    mutex_lock lock(mu_);
    gpu_device_enabled_ = false;
    gpu_allocators_.clear();
    gpu_visitors_.clear();
    gpu_host_allocators_.clear();
    gpu_host_alloc_visitors_.clear();
    gpu_host_free_visitors_.clear();
  }
}

}  // namespace tensorflow