/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"

#include <cstring>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {
namespace {

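// Returns true iff the TF_GPU_ALLOCATOR environment variable is set to
// "cuda_malloc", selecting the pass-through cudaMalloc allocator.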
bool useCudaMallocAllocator() {
  const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
  return debug_allocator_str != nullptr &&
         std::strcmp(debug_allocator_str, "cuda_malloc") == 0;
}

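// Returns true iff the TF_GPU_ALLOCATOR environment variable is set to
// "memory_guard", selecting the memory-overwrite-checking debug allocator.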
bool useCudaMemoryGuardAllocator() {
  const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
  return debug_allocator_str != nullptr &&
         std::strcmp(debug_allocator_str, "memory_guard") == 0;
}

}  // namespace

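// Returns the process-wide GPUProcessState instance. On the first call a
// non-null ps is adopted as the singleton; every later call must pass
// nullptr or the same pointer.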
/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
  DCHECK((!ps) || (ps == instance))
      << "Multiple calls to GPUProcessState with non-null ps";
  return instance;
}

GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
  process_state_ = ProcessState::singleton();
}

int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
  // Return the NUMA node associated with the GPU's StreamExecutor.
  se::StreamExecutor* se =
      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
  int numa_node = se->GetDeviceDescription().numa_node();
  // bus_id must be non-negative.  If the numa_node is not known,
  // use 0.
  return numa_node >= 0 ? numa_node : 0;
}

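// Returns the cached BFC allocator for tf_gpu_id, constructing it (plus any
// optional debug or recording wrappers) on the first call. total_bytes and
// the per-GPU options are only consulted when the allocator is created.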
Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
                                            TfGpuId tf_gpu_id,
                                            size_t total_bytes) {
  CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  const string& allocator_type = options.allocator_type();
  mutex_lock lock(mu_);
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);

  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    gpu_allocators_.resize(tf_gpu_id.value() + 1);
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
  if (allocator_parts.allocator == nullptr) {
    // Validate allocator types.
    if (!allocator_type.empty() && allocator_type != "BFC") {
      LOG(ERROR) << "Invalid allocator type: " << allocator_type;
      return nullptr;
    }

    PlatformGpuId platform_gpu_id;
    TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
    int bus_id = BusIdForGPU(tf_gpu_id);
    DCHECK_GE(bus_id, 0);
    while (bus_id >= gpu_visitors_.size()) {
      gpu_visitors_.push_back({});
    }
    GPUMemAllocator* sub_allocator = new GPUMemAllocator(
        GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
        platform_gpu_id,
        (options.per_process_gpu_memory_fraction() > 1.0 ||
         options.experimental().use_unified_memory()),
        gpu_visitors_[bus_id], {});
    GPUBFCAllocator* gpu_bfc_allocator =
        new GPUBFCAllocator(sub_allocator, total_bytes, options,
                            strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
    Allocator* gpu_allocator = gpu_bfc_allocator;
    SharedCounter* timing_counter = nullptr;
    if (options.experimental().timestamped_allocator()) {
      timing_counter = new SharedCounter;
      gpu_bfc_allocator->SetTimingCounter(timing_counter);
    }

    // If enabled, check for memory overwrites by writing
    // distinctive patterns on both ends of allocated memory.
    if (useCudaMemoryGuardAllocator()) {
      LOG(INFO) << "Using memory guard allocator for GPU.";
      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_gpu_id);
      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, platform_gpu_id);
    } else if (useCudaMallocAllocator()) {
      LOG(INFO) << "Using CUDA malloc allocator for GPU.";
      // If enabled, pass all allocation requests through to cudaMalloc.
      // This is useful for memory debugging with tools like cuda-memcheck.
      // **WARNING** it probably will not work in a multi-GPU scenario.
      gpu_allocator =
          new GPUcudaMallocAllocator(gpu_allocator, platform_gpu_id);
    }

    Allocator* recording_allocator = nullptr;
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::GPU;
      md.dev_index = platform_gpu_id.value();
      md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
          &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
    }
    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
                       std::unique_ptr<SharedCounter>(timing_counter),
                       gpu_bfc_allocator, sub_allocator,
                       std::unique_ptr<Allocator>(recording_allocator)};
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return allocator_parts.recording_allocator.get();
  } else {
    return allocator_parts.allocator.get();
  }
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
                "--config=rocm.";
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

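// Returns the SharedCounter used to timestamp allocations from the BFC
// allocator for tf_gpu_id, creating it lazily; logs an error and returns
// nullptr when no allocator slot exists for that id yet.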
SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
  DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
  mutex_lock l(mu_);
  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_gpu_id.value()
               << " but only have " << gpu_allocators_.size();
    return nullptr;
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
  if (allocator_parts.counter.get() == nullptr) {
    SharedCounter* timing_counter = new SharedCounter;
    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
    allocator_parts.counter.reset(timing_counter);
  }
  return allocator_parts.counter.get();
#else
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

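// Returns a BFC allocator backed by pinned (CUDA/ROCm host) memory for
// numa_node, falling back to the plain CPU allocator when no GPU device is
// enabled or GPU DMA registration is turned off.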
Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
  CHECK(process_state_);
  if (!HasGPUDevice() ||
      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
    return process_state_->GetCPUAllocator(numa_node);
  }
  if (numa_node == port::kNUMANoAffinity) {
    numa_node = 0;
  }
  {
    // Here we optimize the most common use case where gpu_host_allocators_
    // have already been populated and since we're only reading
    // these vectors, we can get by with a shared lock. In the slower case,
    // we take a unique lock and populate these vectors.
    tf_shared_lock lock(mu_);

    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
        !gpu_host_allocators_.empty() &&
        gpu_host_allocators_[0].recording_allocator != nullptr) {
      return gpu_host_allocators_[0].recording_allocator.get();
    }
    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
      return gpu_host_allocators_[0].allocator.get();
    }
  }

  mutex_lock lock(mu_);
  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
  // through, since any will work.
  //
  // This search isn't super clean, and it would be nice to use a
  // better source of information about which executor to use.  For
  // example, process_state could maybe save the first stream executor
  // it knows is valid.
  se::StreamExecutor* se = nullptr;
  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
    if (gpu_allocators_[i].allocator != nullptr) {
      se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
      break;
    }
  }

  CHECK_NE(nullptr, se);

  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
    while (gpu_host_alloc_visitors_.size() <= numa_node) {
      gpu_host_alloc_visitors_.push_back({});
    }
    while (gpu_host_free_visitors_.size() <= numa_node) {
      gpu_host_free_visitors_.push_back({});
    }
    SubAllocator* sub_allocator =
        new GpuHostAllocator(se, numa_node, gpu_host_alloc_visitors_[numa_node],
                             gpu_host_free_visitors_[numa_node]);
    // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
    int64 gpu_host_mem_limit_in_mb = -1;
    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                        1LL << 16 /*64GB max by default*/,
                                        &gpu_host_mem_limit_in_mb);
    if (!status.ok()) {
      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
    }
    int64 gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);

    Allocator* allocator =
        new BFCAllocator(sub_allocator, gpu_host_mem_limit,
                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);

    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                    std::unique_ptr<SharedCounter>(nullptr),
                                    nullptr, sub_allocator,
                                    std::unique_ptr<Allocator>(nullptr)});
    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      allocator_parts.recording_allocator.reset(
          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
                                           allocator_parts.allocator.get(), md,
                                           &mu_));
    }
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return gpu_host_allocators_[0].recording_allocator.get();
  } else {
    return gpu_host_allocators_[0].allocator.get();
  }
}

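// Registers a visitor that the GPU sub-allocators on bus_id invoke on each
// allocation; must be called before the first call to GetGPUAllocator.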
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                         const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_allocators_.empty())  // Crash OK
      << "AddGPUAllocVisitor must be called before "
         "first call to GetGPUAllocator.";
  DCHECK_GE(bus_id, 0);
  while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostAllocVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostAllocVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_alloc_visitors_.size())) {
    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostFreeVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostFreeVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_free_visitors_.size())) {
    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_free_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

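// Test-only: clears all cached allocators and visitor lists so that tests
// can reconfigure the process state from scratch.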
void GPUProcessState::TestOnlyReset() {
  if (process_state_) {
    process_state_->ProcessState::TestOnlyReset();
  }
  {
    mutex_lock lock(mu_);
    gpu_device_enabled_ = false;
    gpu_allocators_.clear();
    gpu_visitors_.clear();
    gpu_host_allocators_.clear();
    gpu_host_alloc_visitors_.clear();
    gpu_host_free_visitors_.clear();
  }
}

}  // namespace tensorflow