/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA virtual memory API is only available in CUDA 10.2 and later.

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_VMEM_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_VMEM_ALLOCATOR_H_

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/stream_executor/lib/statusor.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_types.h"
#endif

#if CUDA_VERSION >= 10020

namespace tensorflow {

// GpuVirtualMemAllocator is a SubAllocator for use with BFCAllocator which
// provides contiguous allocations with each call to Alloc. This is done by
// reserving a large chunk of virtual addresses at construction and then mapping
// physical memory pages to this virtual address range as requested.
//
// This class is not thread-safe.
41 class GpuVirtualMemAllocator : public SubAllocator { 42 public: 43 static stream_executor::port::StatusOr< 44 std::unique_ptr<GpuVirtualMemAllocator>> 45 Create(const std::vector<Visitor>& alloc_visitors, 46 const std::vector<Visitor>& free_visitors, 47 stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id, 48 size_t virtual_address_space_size, 49 const std::vector<PlatformGpuId>& peer_gpu_ids); 50 ~GpuVirtualMemAllocator() override; 51 52 // Allocates memory at least as large as requested by num_bytes. Will be 53 // aligned to the min allocation granularity (typically 2MiB). 54 // alignment is ignored by this allocator. 55 void* Alloc(size_t alignment, size_t num_bytes, 56 size_t* bytes_received) override; 57 58 // Frees should only happen at the end of the contiguous memory allocations or 59 // else we introduce pointless fragmentation...But, this is supported. If the 60 // allocation happens at the end, then the next_alloc_offset_ is moved back, 61 // otherwise a hole is created. 62 // 63 // Holes are not re-used, all allocations continue to come at the end of the 64 // next_alloc_offset_. To accommodate this, the virtual_address_space_size 65 // should be much larger than the max physical size of the allocator. 66 // 67 // In practice, since the BFC allocator coalesces adjacent AllocationRegions, 68 // this free function should never be invoked. 
69 void Free(void* ptr, size_t num_bytes) override; 70 SupportsCoalescing()71 bool SupportsCoalescing() const override { return true; } 72 73 private: 74 GpuVirtualMemAllocator( 75 const std::vector<Visitor>& alloc_visitors, 76 const std::vector<Visitor>& free_visitors, 77 stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id, 78 std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles, 79 stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity); 80 81 stream_executor::gpu::GpuContext& gpu_context_; 82 PlatformGpuId gpu_id_; 83 84 // Peer access is configured at mmap time so the allocator must be aware of 85 // all gpus that may want to read the memory. This list also includes the 86 // above gpu_id_ to facilitate the invocation of the GpuDriver::MapMemory 87 // function. 88 const std::vector<stream_executor::gpu::GpuDeviceHandle> access_gpu_handles_; 89 90 // The virtual memory span held by this allocator. 91 stream_executor::gpu::GpuDriver::VmemSpan vmem_; 92 // The next offset from the vmem base address that will be allocated. This 93 // corresponds to the size of physically pinned memory if holes haven't been 94 // created with "free". 95 size_t next_alloc_offset_ = 0; 96 97 // Smallest allocation as determined by CUDA. 98 const size_t granularity_; 99 100 struct Mapping { 101 stream_executor::gpu::GpuDevicePtr va; 102 stream_executor::gpu::GpuDriver::GenericMemoryHandle physical; 103 }; 104 // List of mappings, sorted by va. 105 std::vector<Mapping> mappings_; 106 107 TF_DISALLOW_COPY_AND_ASSIGN(GpuVirtualMemAllocator); 108 }; 109 110 } // namespace tensorflow 111 112 #endif // CUDA_VERSION >= 10200 113 114 #endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_VMEM_ALLOCATOR_H_ 115