/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"

#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/stream_executor/lib/status.h"

#if CUDA_VERSION >= 10020

namespace tensorflow {
namespace {

using ::stream_executor::gpu::GpuContext;
using ::stream_executor::gpu::GpuDeviceHandle;
using ::stream_executor::gpu::GpuDevicePtr;
using ::stream_executor::gpu::GpuDriver;
using ::stream_executor::port::Status;
using ::stream_executor::port::StatusOr;

// Rounds value up to the specified power of two alignment.
size_t AlignUp(size_t value, size_t alignment) {
  DCHECK_EQ(alignment & (alignment - 1), 0)
      << "Alignment must be a power of two; alignment=" << alignment;
  return (value + alignment - 1) & ~(alignment - 1);
}

StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
  return GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device);
}

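// Returns an error if `device` does not support CUDA virtual address
// management; returns OK otherwise.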
Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
                                            PlatformDeviceId gpu_id) {
  TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
                      SupportsVirtualAddressManagement(device));
  if (!supports_virtual_address_management) {
    return stream_executor::port::InternalError(absl::StrFormat(
        "GPU %d does not support virtual memory address management.",
        gpu_id.value()));
  }
  return {};
}

}  // namespace

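// Illustrative usage sketch (not taken from this file; the GpuContext setup
// and device ids are assumed/simplified for the example):
//
//   stream_executor::gpu::GpuContext* ctx = ...;  // context for this GPU
//   TF_ASSIGN_OR_RETURN(
//       auto allocator,
//       GpuVirtualMemAllocator::Create(
//           /*alloc_visitors=*/{}, /*free_visitors=*/{}, *ctx,
//           PlatformDeviceId(0),
//           /*virtual_address_space_size=*/size_t{1} << 32,
//           /*peer_gpu_ids=*/{}));
//   size_t received = 0;
//   void* ptr = allocator->Alloc(/*alignment=*/256, /*num_bytes=*/1 << 20,
//                                &received);
//   allocator->Free(ptr, received);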
/* static */ stream_executor::port::StatusOr<
    std::unique_ptr<GpuVirtualMemAllocator>>
GpuVirtualMemAllocator::Create(
    const std::vector<Visitor>& alloc_visitors,
    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
    PlatformDeviceId gpu_id, size_t virtual_address_space_size,
    const std::vector<PlatformDeviceId>& peer_gpu_ids) {
  std::vector<GpuDeviceHandle> access_gpu_handles;
  access_gpu_handles.reserve(peer_gpu_ids.size() + 1);

  GpuDeviceHandle gpu_handle;
  TF_RETURN_IF_ERROR(GpuDriver::GetDevice(gpu_id.value(), &gpu_handle));
  TF_RETURN_IF_ERROR(CheckVirtualAddressManagementSupport(gpu_handle, gpu_id));

  access_gpu_handles.push_back(gpu_handle);
  for (const auto& peer_id : peer_gpu_ids) {
    GpuDeviceHandle peer_handle;
    TF_RETURN_IF_ERROR(GpuDriver::GetDevice(peer_id.value(), &peer_handle));
    TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
                        SupportsVirtualAddressManagement(peer_handle));
    if (GpuDriver::CanEnablePeerAccess(gpu_handle, peer_handle) &&
        supports_virtual_address_management) {
      access_gpu_handles.push_back(peer_handle);
    }
  }

  // Find the min granularity for all devices that have access to this memory;
  // that is, the maximum min granularity among all devices.
  size_t max_granularity = 1;
  for (const auto device_handle : access_gpu_handles) {
    TF_ASSIGN_OR_RETURN(size_t granularity,
                        GpuDriver::GetMinAllocationGranularity(device_handle));
    max_granularity = std::max(max_granularity, granularity);
  }

  // Create the virtual memory reservation. Must be aligned to system page
  // size, and larger than the CUDA min granularity. Empirically, the
  // granularity check is sufficient as the granularity is some multiple of the
  // page size.
  // TODO(imintz): Create OS agnostic page size utility for completeness.
  TF_ASSIGN_OR_RETURN(
      GpuDriver::VmemSpan vmem,
      GpuDriver::ReserveVirtualMemory(
          &gpu_context, AlignUp(virtual_address_space_size, max_granularity)));
  VLOG(1) << "Reserved GPU virtual memory at " << vmem.base << " of size "
          << strings::HumanReadableNumBytes(vmem.size_bytes) << " bytes";

  return std::unique_ptr<GpuVirtualMemAllocator>(new GpuVirtualMemAllocator(
      alloc_visitors, free_visitors, gpu_context, gpu_id,
      std::move(access_gpu_handles), vmem, max_granularity));
}

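// The constructor only records the state assembled by Create(); the virtual
// address reservation is handed in via `vmem`, and no physical memory is
// committed until Alloc() is called.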
GpuVirtualMemAllocator::GpuVirtualMemAllocator(
    const std::vector<Visitor>& alloc_visitors,
    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
    PlatformDeviceId gpu_id,
    const std::vector<GpuDeviceHandle> access_gpu_handles,
    GpuDriver::VmemSpan vmem, size_t granularity)
    : SubAllocator(alloc_visitors, free_visitors),
      gpu_context_(gpu_context),
      gpu_id_(gpu_id),
      access_gpu_handles_(access_gpu_handles),
      vmem_(vmem),
      granularity_(granularity) {}

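// Unmaps and releases every outstanding physical allocation, then frees the
// reserved virtual address range.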
GpuVirtualMemAllocator::~GpuVirtualMemAllocator() {
  for (const auto mapping : mappings_) {
    GpuDriver::UnmapMemory(&gpu_context_, mapping.va, mapping.physical.bytes);
    GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(mapping.physical));
  }
  GpuDriver::FreeVirtualMemory(&gpu_context_, vmem_);
}

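// Rounds `num_bytes` up to the allocation granularity, creates a physical
// memory handle of that size, and maps it at the next unused offset of the
// reserved virtual range. Returns nullptr on failure, including when the
// reservation has been exhausted.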
void* GpuVirtualMemAllocator::Alloc(size_t alignment, size_t num_bytes,
                                    size_t* bytes_received) {
  if (num_bytes == 0) return nullptr;
  size_t padded_bytes = (num_bytes + granularity_ - 1) & ~(granularity_ - 1);

  GpuDevicePtr next_va = vmem_.base + next_alloc_offset_;

  // TODO(imintz): Attempt to extend the vmem allocation by reserving
  // additional virtual memory at the specific address at the end of the
  // initial vmem reservation.
  if (next_va + padded_bytes > vmem_.base + vmem_.size_bytes) {
    LOG(ERROR) << "OOM in GPU virtual memory allocator when attempting to "
                  "allocate {request: "
               << strings::HumanReadableNumBytes(num_bytes)
               << ", aligned: " << padded_bytes << "} bytes.";
    return nullptr;
  }

  // Create physical memory backing allocation.
  auto maybe_handle =
      GpuDriver::CreateMemoryHandle(&gpu_context_, padded_bytes);
  if (!maybe_handle.ok()) {
    LOG(ERROR) << maybe_handle.status();
    return nullptr;
  }
  GpuDriver::GenericMemoryHandle handle = std::move(maybe_handle).ValueOrDie();

  // Map VAs for this physical memory.
  auto status =
      GpuDriver::MapMemory(&gpu_context_, next_va, handle, access_gpu_handles_);
  if (!status.ok()) {
    LOG(ERROR) << status;
    GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(handle));
    return nullptr;
  }
  next_alloc_offset_ += handle.bytes;
  mappings_.push_back({next_va, std::move(handle)});
  VisitAlloc(reinterpret_cast<void*>(next_va), gpu_id_.value(), padded_bytes);
  *bytes_received = padded_bytes;
  return reinterpret_cast<void*>(next_va);
}

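// Frees a region previously returned by Alloc(). `ptr` must be the start of a
// mapping and `num_bytes` must cover a whole number of consecutive mappings;
// the corresponding physical handles are unmapped and released, and the tail
// of the virtual range is made reusable when the freed region is the last one.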
void GpuVirtualMemAllocator::Free(void* ptr, size_t num_bytes) {
  if (ptr == nullptr) return;

  auto mapping_it =
      std::lower_bound(mappings_.begin(), mappings_.end(), ptr,
                       [](const Mapping& mapping, const void* ptr) {
                         return reinterpret_cast<const void*>(mapping.va) < ptr;
                       });
  if (mapping_it == mappings_.end() ||
      (reinterpret_cast<void*>(mapping_it->va) != ptr)) {
    LOG(ERROR) << "Could not find GPU vmem mapping for address at "
               << reinterpret_cast<uintptr_t>(ptr);
    return;
  }

  int num_mappings_to_free = 0;
  size_t total_bytes = 0;  // size_t to avoid overflow for multi-GiB frees.
  for (auto it = mapping_it; it != mappings_.end() && total_bytes < num_bytes;
       ++it) {
    ++num_mappings_to_free;
    total_bytes += it->physical.bytes;
  }
  if (total_bytes != num_bytes) {
    LOG(ERROR) << "Invalid size requested for freeing GPU vmem mapping. Got "
               << strings::HumanReadableNumBytes(num_bytes) << " but expected "
               << strings::HumanReadableNumBytes(mapping_it->physical.bytes);
    return;
  }

  VLOG(1) << "Freeing " << num_mappings_to_free << " mappings for a total of "
          << total_bytes << " bytes";
  for (auto it = mapping_it; it < mapping_it + num_mappings_to_free; ++it) {
    GpuDriver::UnmapMemory(&gpu_context_, it->va, it->physical.bytes);
    GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(it->physical));
  }

  // Move back the next_alloc_offset_ if this free was at the end.
  if (mapping_it + num_mappings_to_free == mappings_.end()) {
    next_alloc_offset_ = mapping_it->va - vmem_.base;
  }

  mappings_.erase(mapping_it, mapping_it + num_mappings_to_free);
  VisitFree(ptr, gpu_id_.value(), num_bytes);
}

}  // namespace tensorflow

#endif  // CUDA_VERSION >= 10020