#include <ATen/cuda/CachingHostAllocator.h>

#include <ATen/DeviceGuard.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/detail/CUDAHooks.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <c10/core/thread_pool.h>
#include <c10/cuda/CUDAAllocatorConfig.h>

#include <cuda_runtime_api.h>
#include <future>

namespace at::cuda {
namespace {

// Note: cudaEventCreate when concurrently invoked from multiple threads can be
// very expensive (at least on certain device/driver combinations). Thus, we a)
// serialize event creation at a per-device level, and b) pool the events to
// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in
// significant improvements in multithreaded workloads with high allocation
// rates.
class EventPool {
 public:
  using Event = std::unique_ptr<
      at::cuda::CUDAEvent,
      std::function<void(at::cuda::CUDAEvent*)>>;
  EventPool() : pools_(at::cuda::device_count()) {}

  Event get(DeviceIndex device) {
    TORCH_INTERNAL_ASSERT(0 <= device);
    TORCH_INTERNAL_ASSERT(device < static_cast<DeviceIndex>(pools_.size()));
    auto& pool = pools_[device];
    auto destructor = [&pool](at::cuda::CUDAEvent* event) {
      std::lock_guard<std::mutex> g(pool.mutex_);
      pool.event_pool_.push_back(std::unique_ptr<at::cuda::CUDAEvent>(event));
    };

    // Try to acquire an event from the per-device pool.
    {
      std::lock_guard<std::mutex> g(pool.mutex_);
      if (!pool.event_pool_.empty()) {
        auto* event = pool.event_pool_.back().release();
        pool.event_pool_.pop_back();
        return Event(event, destructor);
      }
    }
    // Otherwise, allocate a new event that will be returned to the pool on
    // destruction.
    return Event(
        std::make_unique<at::cuda::CUDAEvent>(cudaEventDisableTiming).release(),
        destructor);
  }

  void empty_cache() {
    for (auto& pool : pools_) {
      std::lock_guard<std::mutex> g(pool.mutex_);
      pool.event_pool_.clear();
    }
  }

 private:
  struct PerDevicePool {
    alignas(64) std::mutex mutex_;
    std::vector<std::unique_ptr<at::cuda::CUDAEvent>> event_pool_;
  };
  std::vector<PerDevicePool> pools_;
};
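
// Usage sketch (illustration only; getCurrentCUDAStream() is assumed to be
// available via the CUDAStream/CUDAEvent headers): because Event carries a
// custom deleter, an event borrowed from the pool is returned automatically
// when the handle goes out of scope.
//
//   EventPool pool;
//   {
//     auto ev = pool.get(/*device=*/0);    // reuse a pooled event if available
//     ev->record(getCurrentCUDAStream());  // record on the current stream
//   }                                      // deleter pushes the event back into the pool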

using Block = HostBlock<CUDAStream>;

struct CUDACachingHostAllocatorImpl
    : public CachingHostAllocatorImpl<CUDAStream, EventPool::Event> {
 private:
  void allocate_host_memory(size_t size, void** ptr) override {
    // Pinned memory pointers allocated by any device can be directly used by
    // any other device, regardless of the current device at the time of
    // allocation, since we assume unified addressing. So we grab any existing
    // primary context, if available. See pytorch/pytorch#21081.
    at::OptionalDeviceGuard device_guard;
    auto primary_ctx_device_index =
        c10::cuda::getDeviceIndexWithPrimaryContext();
    if (primary_ctx_device_index.has_value()) {
      device_guard.reset_device(
          at::Device(at::DeviceType::CUDA, *primary_ctx_device_index));
    }

    if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
            pinned_use_cuda_host_register()) {
      allocWithCudaHostRegister(ptr, size);
    } else {
      // Use cudaHostAlloc for allocating pinned memory (global lock in driver)
      C10_CUDA_CHECK(cudaHostAlloc(ptr, size, cudaHostAllocDefault));
    }
  }

  void free_block(Block* block) override {
    if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
            pinned_use_cuda_host_register()) {
      void* ptr = block->ptr_;
      AT_CUDA_CHECK(cudaHostUnregister(ptr));
      std::free(ptr);
    } else {
      AT_CUDA_CHECK(cudaFreeHost(block->ptr_));
    }
  }

  void record_stream(
      std::optional<std::vector<EventPool::Event>>& events,
      CUDAStream stream) override {
    auto event = create_event_internal(stream.device_index());
    event->record(stream);
    events->push_back(std::move(event));
  }

  bool query_event(EventPool::Event& event) override {
    cudaError_t err = cudaEventQuery(*event);
    if (err == cudaErrorNotReady) {
      (void)cudaGetLastError(); // clear CUDA error
      return false;
    } else if (err != cudaSuccess) {
      C10_CUDA_CHECK(err);
    }
    return true;
  }
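
  // Note: record_stream() and query_event() together drive the event lifecycle
  // used by the CachingHostAllocatorImpl base class: when a pinned block is
  // freed while one or more streams may still be using it, an event is
  // recorded on each such stream, and the block only becomes available for
  // reuse once query_event() reports that all of its recorded events have
  // completed.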

  EventPool::Event create_event_internal(DeviceIndex idx) {
    // Leak the event pool to avoid shutdown issues.
    static auto* event_pool = new EventPool();
    return event_pool->get(idx);
  }

  TaskThreadPool* getThreadPool() {
    static TaskThreadPool* pool = new TaskThreadPool(
        c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
            pinned_max_register_threads());
    return pool;
  }

  void mapPagesForRegister(
      const void* ptr,
      size_t size,
      size_t i,
      size_t numThreads,
      size_t pageSize) {
    uintptr_t start = (uintptr_t)ptr + (size * i / numThreads);
    uintptr_t end = (uintptr_t)start + (size / numThreads);
    if (i == (numThreads - 1)) {
      end = (uintptr_t)ptr + size;
    }

    // Pre-fault/map the pages by touching the first byte of each page.
    uintptr_t alignedStart =
        (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
    for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
      memset((void*)p, 0, 1);
    }
  }
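
  // Worked example of the chunking above (numbers purely illustrative): with
  // size = 1 MiB, numThreads = 4, and pageSize = 4 KiB, thread i touches one
  // byte in every 4 KiB page of bytes [i * 256 KiB, (i + 1) * 256 KiB), and
  // the last thread additionally covers any remainder. Touching a byte forces
  // the kernel to back the page with physical memory before cudaHostRegister
  // pins it.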

  void registerPages(const void* ptr, size_t size) {
    AT_CUDA_CHECK(
        cudaHostRegister((void*)ptr, (size_t)size, cudaHostRegisterDefault));

    // If the host and device pointers don't match, give a warning and exit.
    void* devptr = nullptr;
    AT_CUDA_CHECK(cudaHostGetDevicePointer(&devptr, (void*)ptr, 0));
    TORCH_CHECK(
        (void*)devptr == (void*)ptr,
        "Host and device pointers don't match with cudaHostRegister. "
        "Please don't use this feature by setting "
        "PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)",
        "");
  }

  void allocWithCudaHostRegister(void** ptr, size_t roundSize) {
    // Here we do a regular allocation, pre-fault/map the pages, and then do
    // cudaHostRegister with GPU mapping flags to lock the pages, so we can
    // minimize the cost for the CUDA global lock.
    *ptr = std::malloc(roundSize);

    // Parallelize the mapping/registering of pages to reduce wall time.
    size_t pageSize = (1 << 12); // 4 KiB pages
    size_t numMapThreads = c10::cuda::CUDACachingAllocator::
        CUDAAllocatorConfig::pinned_num_register_threads();
    if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
      // Parallelize the mapping of pages with a threadpool.
      auto* pool = getThreadPool();
      std::vector<std::promise<void>> promises;
      std::vector<std::future<void>> futures;
      promises.reserve(numMapThreads);
      futures.reserve(numMapThreads);

      for (size_t i = 0; i < numMapThreads; i++) {
        promises.emplace_back();
        futures.push_back(promises[i].get_future());
        auto task = [this,
                     i,
                     ptr,
                     roundSize,
                     numMapThreads,
                     pageSize,
                     &promises]() mutable {
          mapPagesForRegister(
              *ptr,
              roundSize,
              i, // thread task-id
              numMapThreads,
              pageSize);
          // Fulfill the promise once this thread's pages have been mapped.
          promises[i].set_value();
        };
        pool->run(task);
      }
      for (auto& future : futures) {
        future.wait();
      }
    } else {
      // Map pages in the same thread.
      mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
    }

    // Register the mapped pages using cudaHostRegister.
    registerPages(*ptr, roundSize);
  }
};

void raw_local_deleter(void* ptr);

struct CUDACachingHostAllocator final
    : public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
  at::DataPtr allocate(size_t size) override {
    auto ptr_and_ctx = impl_->allocate(size);
    return {
        ptr_and_ctx.first,
        ptr_and_ctx.second,
        &raw_local_deleter,
        at::DeviceType::CPU};
  }
};
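
// Note: allocate() above returns a DataPtr tagged with DeviceType::CPU (the
// memory lives on the host, merely pinned), and installs raw_local_deleter so
// that destroying the DataPtr hands the block back to the cache rather than
// releasing the pinned pages immediately.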

CUDACachingHostAllocator caching_host_allocator;

static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() {
  return caching_host_allocator;
}

void raw_local_deleter(void* ptr) {
  getCUDACachingHostAllocator().free(ptr);
}

} // anonymous namespace

bool CachingHostAllocator_recordEvent(
    void* ptr,
    void* ctx,
    at::cuda::CUDAStream stream) {
  return getCUDACachingHostAllocator().record_event(ptr, ctx, stream);
}
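
// Usage sketch (illustrative only; `nbytes` and `stream` are assumed to be
// supplied by the caller): a typical pattern is to allocate pinned memory
// through the caching allocator, launch an async copy on a stream, and then
// record that stream so the block is not recycled before the copy finishes:
//
//   at::DataPtr ptr = at::cuda::getCachingHostAllocator()->allocate(nbytes);
//   // ... enqueue cudaMemcpyAsync to/from ptr.get() on `stream` ...
//   at::cuda::CachingHostAllocator_recordEvent(
//       ptr.get(), ptr.get_context(), stream);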

// Releases cached pinned memory allocations via cudaFreeHost (or
// cudaHostUnregister + free when the cudaHostRegister path is in use).
void CachingHostAllocator_emptyCache() {
  getCUDACachingHostAllocator().empty_cache();
}

at::Allocator* getCachingHostAllocator() {
  return &getCUDACachingHostAllocator();
}

} // namespace at::cuda