1--- 2 include/cuco/detail/bitwise_compare.cuh | 1 + 3 include/cuco/detail/dynamic_map.inl | 98 ++++++++++++++++++++- 4 include/cuco/detail/dynamic_map_kernels.cuh | 83 +++++++++++++++++ 5 include/cuco/dynamic_map.cuh | 58 +++++++++++- 6 4 files changed, 236 insertions(+), 4 deletions(-) 7 8diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh 9index 3038943..4bd58c2 100644 10--- a/include/cuco/detail/bitwise_compare.cuh 11+++ b/include/cuco/detail/bitwise_compare.cuh 12@@ -18,6 +18,7 @@ 13 14 #include <cstdint> 15 #include <type_traits> 16+#include <cuco/traits.hpp> 17 18 namespace cuco { 19 namespace detail { 20diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl 21index 0c1d2e3..2425c7d 100644 22--- a/include/cuco/detail/dynamic_map.inl 23+++ b/include/cuco/detail/dynamic_map.inl 24@@ -21,30 +21,68 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map( 25 std::size_t initial_capacity, 26 sentinel::empty_key<Key> empty_key_sentinel, 27 sentinel::empty_value<Value> empty_value_sentinel, 28- Allocator const& alloc) 29+ Allocator const& alloc, 30+ cudaStream_t stream) 31 : empty_key_sentinel_(empty_key_sentinel.value), 32 empty_value_sentinel_(empty_value_sentinel.value), 33 size_(0), 34 capacity_(initial_capacity), 35 min_insert_size_(1E4), 36 max_load_factor_(0.60), 37+ counter_allocator_{alloc}, 38 alloc_{alloc} 39 { 40 submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>( 41 initial_capacity, 42 sentinel::empty_key<Key>{empty_key_sentinel}, 43 sentinel::empty_value<Value>{empty_value_sentinel}, 44- alloc)); 45+ alloc, stream)); 46 submap_views_.push_back(submaps_[0]->get_device_view()); 47 submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); 48 49 CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); 50-} // namespace cuco 51+ d_submaps_erase_num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, max_num_submaps_); 52+ CUCO_CUDA_TRY(cudaMallocHost(&h_submaps_erase_num_successes_, sizeof(atomic_ctr_type) * (max_num_submaps_))); 53+} 54+ 55+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator> 56+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map( 57+ std::size_t initial_capacity, 58+ sentinel::empty_key<Key> empty_key_sentinel, 59+ sentinel::empty_value<Value> empty_value_sentinel, 60+ sentinel::erased_key<Key> erased_key_sentinel, 61+ Allocator const& alloc, 62+ cudaStream_t stream) 63+ : empty_key_sentinel_(empty_key_sentinel.value), 64+ empty_value_sentinel_(empty_value_sentinel.value), 65+ erased_key_sentinel_{erased_key_sentinel.value}, 66+ size_(0), 67+ capacity_(initial_capacity), 68+ min_insert_size_(1E4), 69+ max_load_factor_(0.60), 70+ counter_allocator_{alloc}, 71+ alloc_{alloc} 72+{ 73+ submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>( 74+ initial_capacity, 75+ sentinel::empty_key<Key>{empty_key_sentinel}, 76+ sentinel::empty_value<Value>{empty_value_sentinel}, 77+ sentinel::erased_key<Key>{erased_key_sentinel}, 78+ alloc, stream)); 79+ submap_views_.push_back(submaps_[0]->get_device_view()); 80+ submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); 81+ 82+ CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); 83+ d_submaps_erase_num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, max_num_submaps_); 84+ CUCO_CUDA_TRY(cudaMallocHost(&h_submaps_erase_num_successes_, sizeof(atomic_ctr_type) * (max_num_submaps_))); 85+} 86 87 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator> 88 dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map() 89 { 90 CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); 91+ std::allocator_traits<counter_allocator_type>::deallocate(counter_allocator_, d_submaps_erase_num_successes_ , max_num_submaps_); 92+ CUCO_ASSERT_CUDA_SUCCESS(cudaFreeHost(reinterpret_cast<void *>(h_submaps_erase_num_successes_))); 93 } 94 95 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator> 96@@ -75,6 +113,9 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n) 97 98 num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_; 99 submap_idx++; 100+ if (submap_idx > max_num_submaps_) { 101+ throw std::runtime_error("The number of submaps exceeds the maximum[256]"); 102+ } 103 } 104 } 105 106@@ -160,4 +201,55 @@ void dynamic_map<Key, Value, Scope, Allocator>::contains( 107 CUCO_CUDA_TRY(cudaDeviceSynchronize()); 108 } 109 110+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator> 111+template <typename InputIt, typename Hash, typename KeyEqual> 112+void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first, InputIt last, 113+ cudaStream_t stream, Hash hash, KeyEqual key_equal) { 114+ auto num_keys = std::distance(first, last); 115+ if (num_keys == 0) { return; } 116+ 117+ auto constexpr block_size = 128; 118+ auto constexpr stride = 1; 119+ auto constexpr tile_size = 4; 120+ auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); 121+ 122+ static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type)); 123+ for(size_t i = 0; i < max_num_submaps_; i++) { 124+ h_submaps_erase_num_successes_[i] = 0; 125+ } 126+ 127+ CUCO_CUDA_TRY(cudaMemcpyAsync( 128+ d_submaps_erase_num_successes_, h_submaps_erase_num_successes_, submaps_.size() * sizeof(atomic_ctr_type), 129+ cudaMemcpyHostToDevice, stream)); 130+ 131+ detail::erase<block_size, tile_size><<<grid_size, block_size, sizeof(atomic_ctr_type) * submaps_.size(), stream>>>( 132+ first, first + num_keys, d_submaps_erase_num_successes_, submap_mutable_views_.data().get(), submaps_.size(), hash, key_equal); 133+ 134+ CUCO_CUDA_TRY(cudaMemcpyAsync( 135+ h_submaps_erase_num_successes_, d_submaps_erase_num_successes_, submaps_.size() * sizeof(atomic_ctr_type), 136+ cudaMemcpyDeviceToHost, stream)); 137+ 138+ CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); 139+ for(size_t submap_idx = 0; submap_idx < submaps_.size(); submap_idx++){ 140+ submaps_[submap_idx]->size_ -= h_submaps_erase_num_successes_[submap_idx]; 141+ size_ -= h_submaps_erase_num_successes_[submap_idx]; 142+ } 143+} 144+ 145+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator> 146+bool dynamic_map<Key, Value, Scope, Allocator>::get_keys_values(Key *keys, Value *values, cudaStream_t stream) { 147+ *num_successes_ = 0; 148+ int device_id; 149+ CUCO_CUDA_TRY(cudaGetDevice(&device_id)); 150+ CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); 151+ 152+ auto const block_size = 128; 153+ auto const stride = 1; 154+ auto const grid_size = (size_ + stride * block_size - 1) / (stride * block_size); 155+ detail::get_keys_values<<<grid_size, block_size, sizeof(atomic_ctr_type), stream>>>(submaps_.size(), submap_views_.data().get(), num_successes_, keys, values); 156+ 157+ CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); 158+ size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed); 159+ return h_num_successes == size_; 160+} 161 } // namespace cuco 162diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh 163index f261b49..75b2c07 100644 164--- a/include/cuco/detail/dynamic_map_kernels.cuh 165+++ b/include/cuco/detail/dynamic_map_kernels.cuh 166@@ -20,6 +20,7 @@ 167 #include <cuda/std/atomic> 168 169 #include <cooperative_groups.h> 170+#include <cuco/detail/bitwise_compare.cuh> 171 172 namespace cuco { 173 namespace detail { 174@@ -463,5 +464,87 @@ __global__ void contains(InputIt first, 175 key_idx += (gridDim.x * blockDim.x) / tile_size; 176 } 177 } 178+ 179+template <std::size_t block_size, 180+ uint32_t tile_size, 181+ typename InputIt, 182+ typename atomicT, 183+ typename viewT, 184+ typename Hash, 185+ typename KeyEqual> 186+__global__ void erase( 187+ InputIt first, InputIt last, atomicT* num_successes, viewT* views, std::size_t num_submaps, Hash hash, KeyEqual key_equal) 188+{ 189+ extern __shared__ atomicT local_num_successes[]; 190+ 191+ if (threadIdx.x < num_submaps) { 192+ local_num_successes[threadIdx.x] = 0; 193+ } 194+ __syncthreads(); 195+ 196+ auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block()); 197+ auto tid = block_size * blockIdx.x + threadIdx.x; 198+ auto it = first + tid / tile_size; 199+ 200+ while (it < last) { 201+ for (auto submap_idx = 0; submap_idx < num_submaps; ++submap_idx) { 202+ if (views[submap_idx].erase(tile, *it, hash, key_equal)) { 203+ if (tile.thread_rank() == 0) { 204+ local_num_successes[submap_idx] += 1; 205+ } 206+ break; 207+ } 208+ } 209+ it += (gridDim.x * block_size) / tile_size; 210+ } 211+ 212+ __syncthreads(); 213+ if (threadIdx.x < num_submaps) { 214+ num_successes[threadIdx.x] += local_num_successes[threadIdx.x]; 215+ } 216+} 217+ 218+template<typename Key, typename Value, typename ViewType, typename AtomicType> 219+__global__ void get_keys_values(size_t num_submaps, ViewType *submap_views, AtomicType* global_cnt, Key* keys, Value*values) { 220+ __shared__ size_t global_offset; 221+ extern __shared__ AtomicType local_cnt[]; 222+ const int default_offset_sentinel = -1; 223+ 224+ for (size_t submap_idx = 0; submap_idx < num_submaps; submap_idx++){ 225+ auto & submap_view = submap_views[submap_idx]; 226+ 227+ for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < submap_view.get_capacity(); 228+ tid += blockDim.x * gridDim.x) { 229+ if (threadIdx.x == 0) { 230+ local_cnt[0] = 0; 231+ } 232+ __syncthreads(); 233+ 234+ auto current_slot = submap_view.begin_slot() + tid; 235+ const Key & current_key = current_slot->first.load(cuda::std::memory_order_relaxed); 236+ auto const slot_not_idle = 237+ !detail::bitwise_compare(current_key, submap_view.get_empty_key_sentinel()) && 238+ !detail::bitwise_compare(current_key, submap_view.get_erased_key_sentinel()); 239+ 240+ int local_offset = default_offset_sentinel; 241+ if(slot_not_idle) { 242+ local_offset = local_cnt[0].fetch_add(1, cuda::std::memory_order_relaxed); 243+ } 244+ __syncthreads(); 245+ 246+ if (threadIdx.x == 0) { 247+ auto local_cnt_value = local_cnt[0].load(cuda::std::memory_order_relaxed); 248+ global_offset = global_cnt->fetch_add(local_cnt_value, cuda::std::memory_order_relaxed); 249+ } 250+ __syncthreads(); 251+ 252+ if (local_offset > default_offset_sentinel) { 253+ auto offset = global_offset + local_offset; 254+ keys[offset] = current_key; 255+ values[offset] = current_slot->second.load(cuda::std::memory_order_relaxed); 256+ } 257+ } 258+ } 259+} 260 } // namespace detail 261 } // namespace cuco 262diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh 263index af3ea03..9ed2f25 100644 264--- a/include/cuco/dynamic_map.cuh 265+++ b/include/cuco/dynamic_map.cuh 266@@ -105,6 +105,8 @@ class dynamic_map { 267 using atomic_ctr_type = cuda::atomic<std::size_t, Scope>; ///< Type of atomic counters 268 using view_type = typename static_map<Key, Value, Scope, Allocator>::device_view; ///< Device view type 269 using mutable_view_type = typename static_map<Key, Value, Scope, Allocator>::device_mutable_view; 270+ using counter_allocator_type = typename static_map<Key, Value, Scope, Allocator>::counter_allocator_type; 271+ 272 ///< Device mutable view type 273 274 dynamic_map(dynamic_map const&) = delete; 275@@ -135,7 +137,36 @@ class dynamic_map { 276 dynamic_map(std::size_t initial_capacity, 277 sentinel::empty_key<Key> empty_key_sentinel, 278 sentinel::empty_value<Value> empty_value_sentinel, 279- Allocator const& alloc = Allocator{}); 280+ Allocator const& alloc = Allocator{}, 281+ cudaStream_t stream = 0); 282+ 283+ /** 284+ * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and 285+ * sentinel values. 286+ * 287+ * The capacity of the map will automatically increase as the user adds key/value pairs using 288+ * `insert`. 289+ * 290+ * Capacity increases by a factor of growth_factor each time the size of the map exceeds a 291+ * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the 292+ * map's capacity grows. 293+ * 294+ * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and 295+ * undefined behavior results from attempting to insert any key/value pair 296+ * that contains either. 297+ * 298+ * @param initial_capacity The initial number of slots in the map 299+ * @param empty_key_sentinel The reserved key value for empty slots 300+ * @param empty_value_sentinel The reserved mapped value for empty slots 301+ * @param erased_key_sentinel The reserved value to denote erased slots 302+ * @param alloc Allocator used to allocate submap device storage 303+ */ 304+dynamic_map(std::size_t initial_capacity, 305+ sentinel::empty_key<Key> empty_key_sentinel, 306+ sentinel::empty_value<Value> empty_value_sentinel, 307+ sentinel::erased_key<Key> erased_key_sentinel, 308+ Allocator const& alloc = Allocator(), 309+ cudaStream_t stream = 0); 310 311 /** 312 * @brief Destroy the map and frees its contents 313@@ -227,6 +258,25 @@ class dynamic_map { 314 Hash hash = Hash{}, 315 KeyEqual key_equal = KeyEqual{}); 316 317+ template <typename InputIt, 318+ typename Hash = cuco::detail::MurmurHash3_32<key_type>, 319+ typename KeyEqual = thrust::equal_to<key_type>> 320+ void erase(InputIt first, 321+ InputIt last, 322+ cudaStream_t stream = 0, 323+ Hash hash = Hash{}, 324+ KeyEqual key_equal = KeyEqual{}); 325+ 326+ /** 327+ * @brief Get all keys and values in the hash map. 328+ * 329+ * @param keys The output parameter, pointing the buffer which will maintain all keys in the hash map. 330+ * @param values The output parameter, pointing the buffer which will maintain all values in the hash map. 331+ * @param stream The cuda stream. 332+ * @return Whether export keys and values successfully. 333+ */ 334+ bool get_keys_values(Key *keys, Value *values, cudaStream_t stream = 0); 335+ 336 /** 337 * @brief Gets the current number of elements in the map 338 * 339@@ -307,6 +357,7 @@ class dynamic_map { 340 private: 341 key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot 342 mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot 343+ key_type erased_key_sentinel_{}; ///< Key value that represents an erased slot 344 std::size_t size_{}; ///< Number of keys in the map 345 std::size_t capacity_{}; ///< Maximum number of keys that can be inserted 346 float max_load_factor_{}; ///< Max load factor before capacity growth 347@@ -319,6 +370,11 @@ class dynamic_map { 348 std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert 349 atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert 350 Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage 351+ 352+ counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters 353+ atomic_ctr_type* d_submaps_erase_num_successes_; ///< number of successfully erased keys on erase, atomic on device. 354+ atomic_ctr_type* h_submaps_erase_num_successes_; ///< number of successfully erased keys on erase, atomic on host. 355+ const size_t max_num_submaps_ = 256; ///< The max number of submaps. 356 }; 357 } // namespace cuco 358