/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"

#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/stream_executor/lib/status.h"

#if CUDA_VERSION >= 10020

namespace tensorflow {
namespace {

using ::stream_executor::gpu::GpuContext;
using ::stream_executor::gpu::GpuDeviceHandle;
using ::stream_executor::gpu::GpuDevicePtr;
using ::stream_executor::gpu::GpuDriver;
using ::stream_executor::port::Status;
using ::stream_executor::port::StatusOr;

// Rounds value up to the specified power of two alignment.
size_t AlignUp(size_t value, size_t alignment) {
  DCHECK_EQ(alignment & (alignment - 1), 0)
      << "Alignment must be a power of two; alignment=" << alignment;
  return (value + alignment - 1) & ~(alignment - 1);
}
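
// Worked example of the bit trick above (illustrative only): with
// alignment == 4, AlignUp(5, 4) == (5 + 3) & ~3 == 8, while AlignUp(8, 4)
// stays 8 because 8 is already a multiple of 4.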

StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
  return GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device);
}

Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
                                            PlatformDeviceId gpu_id) {
  TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
                      SupportsVirtualAddressManagement(device));
  if (!supports_virtual_address_management) {
    return stream_executor::port::InternalError(absl::StrFormat(
        "GPU %d does not support virtual memory address management.",
        gpu_id.value()));
  }
  return {};
}

}  // namespace

/* static */ stream_executor::port::StatusOr<
    std::unique_ptr<GpuVirtualMemAllocator>>
GpuVirtualMemAllocator::Create(
    const std::vector<Visitor>& alloc_visitors,
    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
    PlatformDeviceId gpu_id, size_t virtual_address_space_size,
    const std::vector<PlatformDeviceId>& peer_gpu_ids) {
  std::vector<GpuDeviceHandle> access_gpu_handles;
  access_gpu_handles.reserve(peer_gpu_ids.size() + 1);

  GpuDeviceHandle gpu_handle;
  TF_RETURN_IF_ERROR(GpuDriver::GetDevice(gpu_id.value(), &gpu_handle));
  TF_RETURN_IF_ERROR(CheckVirtualAddressManagementSupport(gpu_handle, gpu_id));

  access_gpu_handles.push_back(gpu_handle);
  for (const auto& peer_id : peer_gpu_ids) {
    GpuDeviceHandle peer_handle;
    TF_RETURN_IF_ERROR(GpuDriver::GetDevice(peer_id.value(), &peer_handle));
    TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
                        SupportsVirtualAddressManagement(peer_handle));
    if (GpuDriver::CanEnablePeerAccess(gpu_handle, peer_handle) &&
        supports_virtual_address_management) {
      access_gpu_handles.push_back(peer_handle);
    }
  }

  // Find the min granularity for all devices that have access to this memory;
  // that is, the maximum min granularity among all devices.
  size_t max_granularity = 1;
  for (const auto device_handle : access_gpu_handles) {
    TF_ASSIGN_OR_RETURN(size_t granularity,
                        GpuDriver::GetMinAllocationGranularity(device_handle));
    max_granularity = std::max(max_granularity, granularity);
  }
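
  // Illustrative example: if this GPU reports a 2 MiB minimum allocation
  // granularity and one peer reports 4 MiB, max_granularity becomes 4 MiB, so
  // the reservation below is valid on every device that may map this memory.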

  // Create the virtual memory reservation. Must be aligned to system page size,
  // and larger than the CUDA min granularity. Empirically, the granularity
  // check is sufficient as the granularity is some multiple of the page size.
  // TODO(imintz): Create OS agnostic page size utility for completeness.
  TF_ASSIGN_OR_RETURN(
      GpuDriver::VmemSpan vmem,
      GpuDriver::ReserveVirtualMemory(
          &gpu_context, AlignUp(virtual_address_space_size, max_granularity)));
  VLOG(1) << "Reserved GPU virtual memory at " << vmem.base << " of size "
          << strings::HumanReadableNumBytes(vmem.size_bytes) << " bytes";

  return std::unique_ptr<GpuVirtualMemAllocator>(new GpuVirtualMemAllocator(
      alloc_visitors, free_visitors, gpu_context, gpu_id,
      std::move(access_gpu_handles), vmem, max_granularity));
}
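
// A minimal usage sketch for the factory above (hypothetical call site; the
// context, sizes, and visitor lists are placeholders, not the real caller):
//
//   auto allocator_or = GpuVirtualMemAllocator::Create(
//       /*alloc_visitors=*/{}, /*free_visitors=*/{}, *gpu_context,
//       /*gpu_id=*/PlatformDeviceId(0),
//       /*virtual_address_space_size=*/size_t{1} << 32,  // 4 GiB of VAs.
//       /*peer_gpu_ids=*/{});
//   if (allocator_or.ok()) {
//     std::unique_ptr<GpuVirtualMemAllocator> allocator =
//         std::move(allocator_or).ValueOrDie();
//   }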

GpuVirtualMemAllocator::GpuVirtualMemAllocator(
    const std::vector<Visitor>& alloc_visitors,
    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
    PlatformDeviceId gpu_id,
    const std::vector<GpuDeviceHandle> access_gpu_handles,
    GpuDriver::VmemSpan vmem, size_t granularity)
    : SubAllocator(alloc_visitors, free_visitors),
      gpu_context_(gpu_context),
      gpu_id_(gpu_id),
      access_gpu_handles_(access_gpu_handles),
      vmem_(vmem),
      granularity_(granularity) {}

GpuVirtualMemAllocator::~GpuVirtualMemAllocator() {
  // Iterate by non-const reference so the std::move below actually moves the
  // handle instead of copying from a const value.
  for (auto& mapping : mappings_) {
    GpuDriver::UnmapMemory(&gpu_context_, mapping.va, mapping.physical.bytes);
    GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(mapping.physical));
  }
  GpuDriver::FreeVirtualMemory(&gpu_context_, vmem_);
}

void* GpuVirtualMemAllocator::Alloc(size_t alignment, size_t num_bytes,
                                    size_t* bytes_received) {
  if (num_bytes == 0) return nullptr;
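  // Note: the explicit `alignment` argument is not consulted below; every
  // suballocation starts on a granularity_ boundary, which is a multiple of
  // the system page size and in practice satisfies any alignment callers
  // request. The next line rounds the request up to the allocation
  // granularity with the same power-of-two trick as AlignUp, e.g. a 3 MiB
  // request with a 2 MiB granularity is padded to 4 MiB.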
  size_t padded_bytes = (num_bytes + granularity_ - 1) & ~(granularity_ - 1);

  GpuDevicePtr next_va = vmem_.base + next_alloc_offset_;

  // TODO(imintz): Attempt to extend the vmem allocation by reserving additional
  // virtual memory at the specific address at the end of the initial vmem
  // reservation.
  if (next_va + padded_bytes > vmem_.base + vmem_.size_bytes) {
    LOG(ERROR) << "OOM in GPU virtual memory allocator when attempting to "
                  "allocate {request: "
               << strings::HumanReadableNumBytes(num_bytes)
               << ", aligned: " << padded_bytes << "} bytes.";
    return nullptr;
  }

  // Create physical memory backing allocation.
  auto maybe_handle =
      GpuDriver::CreateMemoryHandle(&gpu_context_, padded_bytes);
  if (!maybe_handle.ok()) {
    LOG(ERROR) << maybe_handle.status();
    return nullptr;
  }
  GpuDriver::GenericMemoryHandle handle = std::move(maybe_handle).ValueOrDie();

  // Map VAs for this physical memory.
  auto status =
      GpuDriver::MapMemory(&gpu_context_, next_va, handle, access_gpu_handles_);
  if (!status.ok()) {
    LOG(ERROR) << status;
    GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(handle));
    return nullptr;
  }
  next_alloc_offset_ += handle.bytes;
  mappings_.push_back({next_va, std::move(handle)});
  VisitAlloc(reinterpret_cast<void*>(next_va), gpu_id_.value(), padded_bytes);
  *bytes_received = padded_bytes;
  return reinterpret_cast<void*>(next_va);
}

void GpuVirtualMemAllocator::Free(void* ptr, size_t num_bytes) {
  if (ptr == nullptr) return;

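  // mappings_ stays sorted by va: new allocations always land at or past the
  // end of every live mapping, and erase() preserves order, so a binary
  // search can locate the mapping that starts exactly at ptr.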
  auto mapping_it =
      std::lower_bound(mappings_.begin(), mappings_.end(), ptr,
                       [](const Mapping& mapping, const void* ptr) {
                         return reinterpret_cast<const void*>(mapping.va) < ptr;
                       });
  if (mapping_it == mappings_.end() ||
      (reinterpret_cast<void*>(mapping_it->va) != ptr)) {
    LOG(ERROR) << "Could not find GPU vmem mapping for address at "
               << reinterpret_cast<uintptr_t>(ptr);
    return;
  }

  int num_mappings_to_free = 0;
  // Use size_t to match num_bytes and avoid overflow for multi-GiB frees.
  size_t total_bytes = 0;
  for (auto it = mapping_it; it != mappings_.end() && total_bytes < num_bytes;
       ++it) {
    ++num_mappings_to_free;
    total_bytes += it->physical.bytes;
  }
  if (total_bytes != num_bytes) {
    LOG(ERROR) << "Invalid size requested for freeing GPU vmem mapping. Got "
               << strings::HumanReadableNumBytes(num_bytes) << " but expected "
               << strings::HumanReadableNumBytes(total_bytes);
    return;
  }

  VLOG(1) << "Freeing " << num_mappings_to_free << " mappings for a total of "
          << total_bytes << " bytes";
  for (auto it = mapping_it; it < mapping_it + num_mappings_to_free; ++it) {
    GpuDriver::UnmapMemory(&gpu_context_, it->va, it->physical.bytes);
    GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(it->physical));
  }

  // Move back the next_alloc_offset_ if this free was at the end.
  if (mapping_it + num_mappings_to_free == mappings_.end()) {
    next_alloc_offset_ = mapping_it->va - vmem_.base;
  }

  mappings_.erase(mapping_it, mapping_it + num_mappings_to_free);
  VisitFree(ptr, gpu_id_.value(), num_bytes);
}
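
// Note on the Free contract above: callers must pass the exact padded size
// that Alloc reported via *bytes_received (or the combined size of a run of
// whole mappings); the size check above rejects partial frees of a mapping.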

}  // namespace tensorflow

#endif  // CUDA_VERSION >= 10020