#include <ATen/cuda/CachingHostAllocator.h>

#include <ATen/DeviceGuard.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/detail/CUDAHooks.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <c10/core/thread_pool.h>
#include <c10/cuda/CUDAAllocatorConfig.h>

#include <cuda_runtime_api.h>
#include <future>

namespace at::cuda {
namespace {

// Note: cudaEventCreate when concurrently invoked from multiple threads can be
// very expensive (at least on certain device/driver combinations). Thus, we a)
// serialize event creation at a per-device level, and b) pool the events to
// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
// significant improvements in multithreaded workloads with high allocation
// rates.
class EventPool {
 public:
  using Event = std::unique_ptr<
      at::cuda::CUDAEvent,
      std::function<void(at::cuda::CUDAEvent*)>>;
  EventPool() : pools_(at::cuda::device_count()) {}

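  // Hands out a cached event for `device`, creating a new one only when the
  // per-device pool is empty. The custom deleter returns the event to the pool
  // instead of destroying it.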
  Event get(DeviceIndex device) {
    TORCH_INTERNAL_ASSERT(0 <= device);
    TORCH_INTERNAL_ASSERT(device < static_cast<DeviceIndex>(pools_.size()));
    auto& pool = pools_[device];
    auto destructor = [&pool](at::cuda::CUDAEvent* event) {
      std::lock_guard<std::mutex> g(pool.mutex_);
      pool.event_pool_.push_back(std::unique_ptr<at::cuda::CUDAEvent>(event));
    };

    // Try to acquire an event from the per-device pool.
    {
      std::lock_guard<std::mutex> g(pool.mutex_);
      if (!pool.event_pool_.empty()) {
        auto* event = pool.event_pool_.back().release();
        pool.event_pool_.pop_back();
        return Event(event, destructor);
      }
    }
    // otherwise, allocate a new event that will be returned to the pool on
    // destruction.
    return Event(
        std::make_unique<at::cuda::CUDAEvent>(cudaEventDisableTiming).release(),
        destructor);
  }

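  // Drops every cached event on every device, destroying the underlying
  // cudaEvent_t handles.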
  void empty_cache() {
    for (auto& pool : pools_) {
      std::lock_guard<std::mutex> g(pool.mutex_);
      pool.event_pool_.clear();
    }
  }

 private:
  struct PerDevicePool {
    alignas(64) std::mutex mutex_;
    std::vector<std::unique_ptr<at::cuda::CUDAEvent>> event_pool_;
  };
  std::vector<PerDevicePool> pools_;
};

using Block = HostBlock<CUDAStream>;

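// CUDA backend of the generic caching host allocator: it supplies the
// pinned-memory allocate/free hooks and the CUDA-event machinery used to
// decide when a cached host block is safe to reuse.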
struct CUDACachingHostAllocatorImpl
    : public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
 private:
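  // Allocates `size` bytes of pinned host memory into *ptr, either with
  // cudaHostAlloc or with std::malloc + cudaHostRegister, depending on the
  // allocator configuration.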
  void allocate_host_memory(size_t size, void** ptr) override {
    // Pinned memory pointers allocated by any device can be directly used by
    // any other device, regardless of the current device at the time of
    // allocation, since we assume unified addressing. So we grab any existing
    // primary context, if available. See pytorch/pytorch#21081.
    at::OptionalDeviceGuard device_guard;
    auto primary_ctx_device_index =
        c10::cuda::getDeviceIndexWithPrimaryContext();
    if (primary_ctx_device_index.has_value()) {
      device_guard.reset_device(
          at::Device(at::DeviceType::CUDA, *primary_ctx_device_index));
    }

    if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
            pinned_use_cuda_host_register()) {
      allocWithCudaHostRegister(ptr, size);
    } else {
      // Use cudaHostAlloc for allocating pinned memory (global lock in driver)
      C10_CUDA_CHECK(cudaHostAlloc(ptr, size, cudaHostAllocDefault));
    }
  }

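  // Releases a block's pinned memory through whichever path allocated it:
  // cudaHostUnregister + std::free in host-register mode, cudaFreeHost
  // otherwise.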
  void free_block(Block* block) override {
    if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
            pinned_use_cuda_host_register()) {
      void* ptr = block->ptr_;
      AT_CUDA_CHECK(cudaHostUnregister(ptr));
      std::free(ptr);
    } else {
      AT_CUDA_CHECK(cudaFreeHost(block->ptr_));
    }
  }

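  // Records a pooled event on `stream` so the block is only recycled once
  // every stream that used it has passed the recorded point.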
  void record_stream(
      std::optional<std::vector<EventPool::Event>>& events,
      CUDAStream stream) override {
    auto event = create_event_internal(stream.device_index());
    event->record(stream);
    events->push_back(std::move(event));
  }

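  // Non-blocking check of whether a recorded event has completed;
  // cudaErrorNotReady means "still pending" and is cleared rather than
  // surfaced as an error.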
  bool query_event(EventPool::Event& event) override {
    cudaError_t err = cudaEventQuery(*event);
    if (err == cudaErrorNotReady) {
      (void)cudaGetLastError(); // clear CUDA error
      return false;
    } else if (err != cudaSuccess) {
      C10_CUDA_CHECK(err);
    }
    return true;
  }

  EventPool::Event create_event_internal(DeviceIndex idx) {
    // Leak the event pool to avoid shutdown issue.
    static auto* event_pool = new EventPool();
    return event_pool->get(idx);
  }

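  // Lazily constructed (and intentionally leaked) thread pool used to
  // parallelize page pre-faulting for cudaHostRegister-based allocations.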
  TaskThreadPool* getThreadPool() {
    static TaskThreadPool* pool = new TaskThreadPool(
        c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
            pinned_max_register_threads());
    return pool;
  }

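  // Pre-faults thread i's slice of the allocation by touching the first byte
  // of each page, so the pages are already mapped before cudaHostRegister pins
  // them (keeping the time spent under the CUDA global lock small).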
  void mapPagesForRegister(
      const void* ptr,
      size_t size,
      size_t i,
      size_t numThreads,
      size_t pageSize) {
    uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
    uintptr_t end = (uintptr_t)start + (size / numThreads);
    if (i == (numThreads - 1)) {
      end = (uintptr_t)ptr + size;
    }

    // pre-fault/map the pages by setting the first byte of the page
    uintptr_t alignedStart =
        (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
    for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
      memset((void*)p, 0, 1);
    }
  }

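  // Pins the already-mapped pages with cudaHostRegister and verifies that the
  // host pointer and its device alias are identical (unified addressing).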
  void registerPages(const void* ptr, size_t size) {
    AT_CUDA_CHECK(
        cudaHostRegister((void*)ptr, (size_t)size, cudaHostRegisterDefault));

    // If host and device pointer don't match, give a warning and exit
    void* devptr = nullptr;
    AT_CUDA_CHECK(cudaHostGetDevicePointer(&devptr, (void*)ptr, 0));
    TORCH_CHECK(
        (void*)devptr == (void*)ptr,
        "Host and device pointer don't match with cudaHostRegister. "
        "Please don't use this feature by setting "
        "PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)",
        "");
  }

  void allocWithCudaHostRegister(void** ptr, size_t roundSize) {
    // Here we do regular allocation, pre-fault/map the pages, and then do
    // cudaHostRegister with GPU mapping flags to lock the pages, so we
    // can minimize the cost for the cuda global lock.
    *ptr = std::malloc(roundSize);

    // Parallelize the mapping/registering of pages to reduce wall time
    size_t pageSize = (1 << 12); // 4kB pages
    size_t numMapThreads = c10::cuda::CUDACachingAllocator::
        CUDAAllocatorConfig::pinned_num_register_threads();
    if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
      // parallelize the mapping of pages with a threadpool
      auto* pool = getThreadPool();
      std::vector<std::promise<void>> promises;
      std::vector<std::future<void>> futures;
      promises.reserve(numMapThreads);
      futures.reserve(numMapThreads);

      for (size_t i = 0; i < numMapThreads; i++) {
        promises.emplace_back();
        futures.push_back(promises[i].get_future());
        auto task = [this,
                     i,
                     ptr,
                     roundSize,
                     numMapThreads,
                     pageSize,
                     &promises]() mutable {
          mapPagesForRegister(
              *ptr,
              roundSize,
              i, // thread task-id
              numMapThreads,
              pageSize);
          // set the promise once this slice of pages has been mapped
          promises[i].set_value();
        };
        pool->run(task);
      }
      for (auto& future : futures) {
        future.wait();
      }
    } else {
      // Map pages in the same thread
      mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
    }

    // Register the mapped pages using cudaHostRegister
    registerPages(*ptr, roundSize);
  }
};

void raw_local_deleter(void* ptr);

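// at::Allocator front end: hands out DataPtrs backed by the caching impl
// above and routes their deleter through raw_local_deleter.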
struct CUDACachingHostAllocator final
    : public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
  at::DataPtr allocate(size_t size) override {
    auto ptr_and_ctx = impl_->allocate(size);
    return {
        ptr_and_ctx.first,
        ptr_and_ctx.second,
        &raw_local_deleter,
        at::DeviceType::CPU};
  }
};

CUDACachingHostAllocator caching_host_allocator;

static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() {
  return caching_host_allocator;
}

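// Deleter installed on every DataPtr handed out above; it funnels frees back
// into the caching allocator so the block can be reused.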
void raw_local_deleter(void* ptr) {
  getCUDACachingHostAllocator().free(ptr);
}

} // anonymous namespace

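// Marks the pinned block backing `ptr`/`ctx` as in use on `stream`; the block
// is only recycled after the recorded event completes. Returns false if the
// pointer is not managed by this allocator.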
bool CachingHostAllocator_recordEvent(
    void* ptr,
    void* ctx,
    at::cuda::CUDAStream stream) {
  return getCUDACachingHostAllocator().record_event(ptr, ctx, stream);
}

// Releases cached pinned memory allocations via cudaFreeHost (or
// cudaHostUnregister + std::free when host-register mode is enabled).
void CachingHostAllocator_emptyCache() {
  getCUDACachingHostAllocator().empty_cache();
}

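// Returns the process-wide pinned-memory allocator (used, for example, when
// CPU tensors are created with pin_memory=true).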
at::Allocator* getCachingHostAllocator() {
  return &getCUDACachingHostAllocator();
}
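
// A minimal usage sketch (illustrative only; `nbytes` and the async copy are
// placeholders, and real callers reach this allocator through the pin_memory
// and copy_ paths):
//
//   at::DataPtr pinned = getCachingHostAllocator()->allocate(nbytes);
//   auto stream = getCurrentCUDAStream();
//   // ... launch an async H2D copy that reads from pinned.get() on `stream` ...
//   CachingHostAllocator_recordEvent(pinned.get(), pinned.get_context(), stream);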

} // namespace at::cuda