/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"

#include <cstring>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/common_runtime/device/device_host_allocator.h"
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"

namespace tensorflow {

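// The GPU allocator implementation can be overridden at runtime with the
// TF_GPU_ALLOCATOR environment variable. The helpers below recognize the
// values "cuda_malloc", "memory_guard", and "cuda_malloc_async", e.g.
//   TF_GPU_ALLOCATOR=cuda_malloc_async <your TensorFlow program>
// Any other (or unset) value keeps the default BFC allocator.
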
// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "cuda_malloc") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMemoryGuardAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  return allocator_env != nullptr &&
         std::strcmp(allocator_env, "memory_guard") == 0;
}

// NOLINTNEXTLINE(clang-diagnostic-unused-function)
static bool UseCudaMallocAsyncAllocator() {
  const char* allocator_env = std::getenv("TF_GPU_ALLOCATOR");
  auto result = allocator_env != nullptr &&
                std::strcmp(allocator_env, "cuda_malloc_async") == 0;
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
  return result;
#else
  if (result)
    LOG(ERROR) << "The TF_GPU_ALLOCATOR=cuda_malloc_async environment "
               << "variable is set, but TensorFlow was not compiled with "
               << "CUDA 11.2+.";
  return false;
#endif
}

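// Returns the process-wide GPUProcessState. The first call fixes the
// instance; a non-null `ps` passed on a later call must be that same
// instance (enforced by the DCHECK below).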
/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
  DCHECK((!ps) || (ps == instance))
      << "Multiple calls to GPUProcessState with non-null ps";
  return instance;
}

GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
  process_state_ = ProcessState::singleton();
}

int GPUProcessState::BusIdForGPU(TfDeviceId tf_device_id) {
  // Return the NUMA node associated with the GPU's StreamExecutor.
  se::StreamExecutor* se = DeviceIdUtil::ExecutorForTfDeviceId(
                               DEVICE_GPU, GPUMachineManager(), tf_device_id)
                               .ValueOrDie();
  int numa_node = se->GetDeviceDescription().numa_node();
  // bus_id must be non-negative. If the numa_node is not known, use 0.
  return numa_node >= 0 ? numa_node : 0;
}

// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
static SubAllocator* CreateSubAllocator(
    const GPUOptions& options, PlatformDeviceId platform_device_id,
    const std::vector<SubAllocator::Visitor>& alloc_visitors,
    size_t total_bytes, const std::vector<TfDeviceId>& peer_gpu_ids) {
  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
                                                            platform_device_id)
                      .ValueOrDie();

  // FIXME(imintz): Observed OOM issues when using the virtual memory
  // allocators. This should be re-enabled when resolved.
#if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
  // Use the old allocator when unified memory is required.
  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
  if (options.per_process_gpu_memory_fraction() > 1.0 ||
      options.experimental().use_unified_memory()) {
    return new DeviceMemAllocator(executor, platform_device_id,
                                  /*use_unified_memory=*/true, alloc_visitors,
                                  {});
  } else {
    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
        executor->implementation()->GpuContextHack());

    absl::flat_hash_set<PlatformDeviceId> platform_peer_gpu_ids;
    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
    for (const TfDeviceId tf_device_id : peer_gpu_ids) {
      PlatformDeviceId platform_device_id;
      TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id,
                                                     &platform_device_id));
      platform_peer_gpu_ids.insert(platform_device_id);
    }
    std::vector<PlatformDeviceId> platform_peer_gpu_ids_vec(
        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());

    // Adjust virtual address space to be slightly larger than the physical
    // address space in case the BFC allocator performs suboptimal garbage
    // collection.
    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
    // the va space.
    return GpuVirtualMemAllocator::Create(
               alloc_visitors, {}, *gpu_context, platform_device_id,
               /*virtual_address_space_size=*/total_bytes * 2,
               platform_peer_gpu_ids_vec)
        .ValueOrDie()
        .release();
  }
#else
  return new DeviceMemAllocator(
      executor, platform_device_id,
      (options.per_process_gpu_memory_fraction() > 1.0 ||
       options.experimental().use_unified_memory()),
      alloc_visitors, {});
#endif
}

Allocator* GPUProcessState::GetGPUAllocator(
    const GPUOptions& options, TfDeviceId tf_device_id, size_t total_bytes,
    const std::vector<TfDeviceId>& peer_gpu_ids) {
  CHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  const string& allocator_type = options.allocator_type();
  mutex_lock lock(mu_);
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);

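  // The per-TF-device allocator table grows lazily; an entry's allocator is
  // created on first use below.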
  if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    gpu_allocators_.resize(tf_device_id.value() + 1);
  }

  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.allocator == nullptr) {
    // Validate allocator types.
    if (!allocator_type.empty() && allocator_type != "BFC") {
      LOG(ERROR) << "Invalid allocator type: " << allocator_type;
      return nullptr;
    }

    PlatformDeviceId platform_device_id;
    TF_CHECK_OK(
        GpuIdManager::TfToPlatformDeviceId(tf_device_id, &platform_device_id));
    int bus_id = BusIdForGPU(tf_device_id);
    DCHECK_GE(bus_id, 0);
    while (bus_id >= gpu_visitors_.size()) {
      gpu_visitors_.push_back({});
    }
    auto* sub_allocator =
        CreateSubAllocator(options, platform_device_id, gpu_visitors_[bus_id],
                           total_bytes, peer_gpu_ids);
    GPUBFCAllocator* gpu_bfc_allocator = new GPUBFCAllocator(
        sub_allocator, total_bytes, options,
        strings::StrCat("GPU_", tf_device_id.value(), "_bfc"),
        options.experimental().internal_fragmentation_fraction());
    Allocator* gpu_allocator = gpu_bfc_allocator;

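    // Optionally attach a SharedCounter so allocations can be timestamped
    // (enabled via GPUOptions.experimental.timestamped_allocator).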
    SharedCounter* timing_counter = nullptr;
    if (options.experimental().timestamped_allocator()) {
      timing_counter = new SharedCounter;
      gpu_bfc_allocator->SetTimingCounter(timing_counter);
    }

    // The memory guard allocator checks for memory overwrites by writing
    // distinctive patterns on both ends of allocated memory.
    if (UseCudaMemoryGuardAllocator()) {
      LOG(INFO) << "Using memory guard allocator for GPU.";
      gpu_allocator = new GPUDebugAllocator(gpu_allocator, platform_device_id);
      gpu_allocator =
          new GPUNanResetAllocator(gpu_allocator, platform_device_id);
    } else if (UseCudaMallocAllocator()) {
      LOG(INFO) << "Using CUDA malloc allocator for GPU.";
      // Passes all allocation requests straight through to cudaMalloc; useful
      // for memory debugging with tools like cuda-memcheck.
      // **WARNING** probably will not work in a multi-gpu scenario.
      delete gpu_bfc_allocator;
      delete sub_allocator;
      gpu_bfc_allocator = nullptr;
      sub_allocator = nullptr;
      gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
    } else if (UseCudaMallocAsyncAllocator()) {
      LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
                << platform_device_id;
      // Passes all allocation requests straight through to cudaMallocAsync;
      // useful for memory debugging with tools like compute-sanitizer.
      // **WARNING** probably will not work in a multi-gpu scenario.
      delete gpu_bfc_allocator;
      delete sub_allocator;
      gpu_bfc_allocator = nullptr;
      sub_allocator = nullptr;
      gpu_allocator =
          new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
    }

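    // If memory-type recording is requested, additionally wrap the allocator
    // so each allocation is tagged with a MemDesc describing where it lives.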
    Allocator* recording_allocator = nullptr;
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::GPU;
      md.dev_index = platform_device_id.value();
      md.gpu_registered = false;
      md.nic_registered = true;
      recording_allocator = new internal::RecordingAllocator(
          &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
    }
    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
                       std::unique_ptr<SharedCounter>(timing_counter),
                       gpu_bfc_allocator, sub_allocator,
                       std::unique_ptr<Allocator>(recording_allocator)};
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return allocator_parts.recording_allocator.get();
  } else {
    return allocator_parts.allocator.get();
  }
#else
  LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda or "
                "--config=rocm.";
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
  DCHECK(process_state_);
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  DeviceIdUtil::CheckValidTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                     tf_device_id);
  mutex_lock l(mu_);
  if (tf_device_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
    LOG(ERROR) << "Asked for counter for GPU allocator " << tf_device_id.value()
               << " but only have " << gpu_allocators_.size();
    return nullptr;
  }

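  // Lazily create and attach a timing counter if the BFC allocator exists but
  // no counter has been set yet.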
  AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
  if (allocator_parts.counter.get() == nullptr) {
    if (allocator_parts.bfc_allocator == nullptr) {
      return nullptr;
    }
    SharedCounter* timing_counter = new SharedCounter;
    allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
    allocator_parts.counter.reset(timing_counter);
  }
  return allocator_parts.counter.get();
#else
  return nullptr;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
  CHECK(process_state_);
  if (!HasGPUDevice() ||
      !process_state_->ProcessState::FLAGS_brain_mem_reg_gpu_dma) {
    return process_state_->GetCPUAllocator(numa_node);
  }
  if (numa_node == port::kNUMANoAffinity) {
    numa_node = 0;
  }
  {
    // Fast path for the common case: gpu_host_allocators_ has already been
    // populated, and since we only read these vectors here, a shared lock
    // suffices. The slower path below takes a unique lock and populates them.
    tf_shared_lock lock(mu_);

    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types &&
        !gpu_host_allocators_.empty() &&
        gpu_host_allocators_[0].recording_allocator != nullptr) {
      return gpu_host_allocators_[0].recording_allocator.get();
    }
    if (static_cast<int>(gpu_host_allocators_.size()) > numa_node) {
      return gpu_host_allocators_[0].allocator.get();
    }
  }

  mutex_lock lock(mu_);
  // Find the first valid StreamExecutor to request CUDA or ROCm host memory
  // through, since any will work.
  //
  // This search isn't super clean, and it would be nice to use a
  // better source of information about which executor to use. For
  // example, process_state could maybe save the first stream executor
  // it knows is valid.
  se::StreamExecutor* se = nullptr;
  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
    if (gpu_allocators_[i].allocator != nullptr) {
      se = DeviceIdUtil::ExecutorForTfDeviceId(DEVICE_GPU, GPUMachineManager(),
                                               TfDeviceId(i))
               .ValueOrDie();
      break;
    }
  }

  CHECK_NE(nullptr, se);

  while (static_cast<int>(gpu_host_allocators_.size()) <= numa_node) {
    while (gpu_host_alloc_visitors_.size() <= numa_node) {
      gpu_host_alloc_visitors_.push_back({});
    }
    while (gpu_host_free_visitors_.size() <= numa_node) {
      gpu_host_free_visitors_.push_back({});
    }
    SubAllocator* sub_allocator = new DeviceHostAllocator(
        se, numa_node, gpu_host_alloc_visitors_[numa_node],
        gpu_host_free_visitors_[numa_node]);
    // TODO(zheng-xq): evaluate whether 64GB by default is the best choice.
    int64_t gpu_host_mem_limit_in_mb = -1;
    Status status = ReadInt64FromEnvVar("TF_GPU_HOST_MEM_LIMIT_IN_MB",
                                        1LL << 16 /*64GB max by default*/,
                                        &gpu_host_mem_limit_in_mb);
    if (!status.ok()) {
      LOG(ERROR) << "GetGpuHostAllocator: " << status.error_message();
    }
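    // For example, launching the process with TF_GPU_HOST_MEM_LIMIT_IN_MB=8192
    // would cap this GPU host memory pool at 8 GiB.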
    int64_t gpu_host_mem_limit = gpu_host_mem_limit_in_mb * (1LL << 20);

    Allocator* allocator =
        new BFCAllocator(sub_allocator, gpu_host_mem_limit,
                         /*allow_growth=*/true, /*name=*/"gpu_host_bfc");

    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    gpu_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
                                    std::unique_ptr<SharedCounter>(nullptr),
                                    nullptr, sub_allocator,
                                    std::unique_ptr<Allocator>(nullptr)});
    AllocatorParts& allocator_parts = gpu_host_allocators_.back();
    if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
      ProcessState::MemDesc md;
      md.loc = ProcessState::MemDesc::CPU;
      md.dev_index = 0;
      md.gpu_registered = true;
      md.nic_registered = false;
      allocator_parts.recording_allocator.reset(
          new internal::RecordingAllocator(&process_state_->mem_desc_map_,
                                           allocator_parts.allocator.get(), md,
                                           &mu_));
    }
  }
  if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
    return gpu_host_allocators_[0].recording_allocator.get();
  } else {
    return gpu_host_allocators_[0].allocator.get();
  }
}

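// The Add*Visitor methods register SubAllocator visitors and must be called
// before the first call to GetGPUAllocator / GetGpuHostAllocator,
// respectively; the CHECKs below enforce that ordering.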
void GPUProcessState::AddGPUAllocVisitor(int bus_id,
                                         const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_allocators_.empty())  // Crash OK
      << "AddGPUAllocVisitor must be called before "
         "first call to GetGPUAllocator.";
  DCHECK_GE(bus_id, 0);
  while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
    gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_visitors_[bus_id].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostAllocVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostAllocVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_alloc_visitors_.size())) {
    gpu_host_alloc_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_alloc_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::AddGpuHostFreeVisitor(
    int numa_node, const SubAllocator::Visitor& visitor) {
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
  mutex_lock lock(mu_);
  CHECK(gpu_host_allocators_.empty())  // Crash OK
      << "AddGpuHostFreeVisitor must be called before "
         "first call to GetGpuHostAllocator.";
  while (numa_node >= static_cast<int64>(gpu_host_free_visitors_.size())) {
    gpu_host_free_visitors_.push_back(std::vector<SubAllocator::Visitor>());
  }
  gpu_host_free_visitors_[numa_node].push_back(visitor);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}

void GPUProcessState::TestOnlyReset() {
  if (process_state_) {
    process_state_->ProcessState::TestOnlyReset();
  }
  {
    mutex_lock lock(mu_);
    gpu_device_enabled_ = false;
    gpu_allocators_.clear();
    gpu_visitors_.clear();
    gpu_host_allocators_.clear();
    gpu_host_alloc_visitors_.clear();
    gpu_host_free_visitors_.clear();
  }
}

}  // namespace tensorflow