• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1---
2 include/cuco/detail/bitwise_compare.cuh     |  1 +
3 include/cuco/detail/dynamic_map.inl         | 98 ++++++++++++++++++++-
4 include/cuco/detail/dynamic_map_kernels.cuh | 83 +++++++++++++++++
5 include/cuco/dynamic_map.cuh                | 58 +++++++++++-
6 4 files changed, 236 insertions(+), 4 deletions(-)
7
8diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh
9index 3038943..4bd58c2 100644
10--- a/include/cuco/detail/bitwise_compare.cuh
11+++ b/include/cuco/detail/bitwise_compare.cuh
12@@ -18,6 +18,7 @@
13
14 #include <cstdint>
15 #include <type_traits>
16+#include <cuco/traits.hpp>
17
18 namespace cuco {
19 namespace detail {
20diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
21index 0c1d2e3..2425c7d 100644
22--- a/include/cuco/detail/dynamic_map.inl
23+++ b/include/cuco/detail/dynamic_map.inl
24@@ -21,30 +21,68 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
25   std::size_t initial_capacity,
26   sentinel::empty_key<Key> empty_key_sentinel,
27   sentinel::empty_value<Value> empty_value_sentinel,
28-  Allocator const& alloc)
29+  Allocator const& alloc,
30+  cudaStream_t stream)
31   : empty_key_sentinel_(empty_key_sentinel.value),
32     empty_value_sentinel_(empty_value_sentinel.value),
33     size_(0),
34     capacity_(initial_capacity),
35     min_insert_size_(1E4),
36     max_load_factor_(0.60),
37+    counter_allocator_{alloc},
38     alloc_{alloc}
39 {
40   submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
41     initial_capacity,
42     sentinel::empty_key<Key>{empty_key_sentinel},
43     sentinel::empty_value<Value>{empty_value_sentinel},
44-    alloc));
45+    alloc, stream));
46   submap_views_.push_back(submaps_[0]->get_device_view());
47   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
48
49   CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
50-}  // namespace cuco
51+  d_submaps_erase_num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, max_num_submaps_);
52+  CUCO_CUDA_TRY(cudaMallocHost(&h_submaps_erase_num_successes_, sizeof(atomic_ctr_type) * (max_num_submaps_)));
53+}
54+
55+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
56+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
57+  std::size_t initial_capacity,
58+  sentinel::empty_key<Key> empty_key_sentinel,
59+  sentinel::empty_value<Value> empty_value_sentinel,
60+  sentinel::erased_key<Key> erased_key_sentinel,
61+  Allocator const& alloc,
62+  cudaStream_t stream)
63+  : empty_key_sentinel_(empty_key_sentinel.value),
64+    empty_value_sentinel_(empty_value_sentinel.value),
65+    erased_key_sentinel_{erased_key_sentinel.value},
66+    size_(0),
67+    capacity_(initial_capacity),
68+    min_insert_size_(1E4),
69+    max_load_factor_(0.60),
70+    counter_allocator_{alloc},
71+    alloc_{alloc}
72+{
73+  submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
74+    initial_capacity,
75+    sentinel::empty_key<Key>{empty_key_sentinel},
76+    sentinel::empty_value<Value>{empty_value_sentinel},
77+    sentinel::erased_key<Key>{erased_key_sentinel},
78+    alloc, stream));
79+  submap_views_.push_back(submaps_[0]->get_device_view());
80+  submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
81+
82+  CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
83+  d_submaps_erase_num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, max_num_submaps_);
84+  CUCO_CUDA_TRY(cudaMallocHost(&h_submaps_erase_num_successes_, sizeof(atomic_ctr_type) * (max_num_submaps_)));
85+}
86
87 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
88 dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
89 {
90   CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_));
91+  std::allocator_traits<counter_allocator_type>::deallocate(counter_allocator_, d_submaps_erase_num_successes_ , max_num_submaps_);
92+  CUCO_ASSERT_CUDA_SUCCESS(cudaFreeHost(reinterpret_cast<void *>(h_submaps_erase_num_successes_)));
93 }
94
95 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
96@@ -75,6 +113,9 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
97
98     num_elements_remaining -= max_load_factor_ * submap_capacity - min_insert_size_;
99     submap_idx++;
100+    if (submap_idx > max_num_submaps_) {
101+      throw std::runtime_error("The number of submaps exceeds the maximum[256]");
102+    }
103   }
104 }
105
106@@ -160,4 +201,55 @@ void dynamic_map<Key, Value, Scope, Allocator>::contains(
107   CUCO_CUDA_TRY(cudaDeviceSynchronize());
108 }
109
110+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
111+template <typename InputIt, typename Hash, typename KeyEqual>
112+void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first, InputIt last,
113+      cudaStream_t stream, Hash hash, KeyEqual key_equal) {
114+  auto num_keys = std::distance(first, last);
115+  if (num_keys == 0) { return; }
116+
117+  auto constexpr block_size = 128;
118+  auto constexpr stride     = 1;
119+  auto constexpr tile_size  = 4;
120+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
121+
122+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
123+  for(size_t i = 0; i < max_num_submaps_; i++) {
124+    h_submaps_erase_num_successes_[i] = 0;
125+  }
126+
127+  CUCO_CUDA_TRY(cudaMemcpyAsync(
128+    d_submaps_erase_num_successes_, h_submaps_erase_num_successes_, submaps_.size() * sizeof(atomic_ctr_type),
129+    cudaMemcpyHostToDevice, stream));
130+
131+  detail::erase<block_size, tile_size><<<grid_size, block_size, sizeof(atomic_ctr_type) * submaps_.size(), stream>>>(
132+    first, first + num_keys, d_submaps_erase_num_successes_, submap_mutable_views_.data().get(), submaps_.size(), hash, key_equal);
133+
134+  CUCO_CUDA_TRY(cudaMemcpyAsync(
135+    h_submaps_erase_num_successes_, d_submaps_erase_num_successes_, submaps_.size() * sizeof(atomic_ctr_type),
136+    cudaMemcpyDeviceToHost, stream));
137+
138+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
139+  for(size_t submap_idx = 0; submap_idx < submaps_.size(); submap_idx++){
140+    submaps_[submap_idx]->size_ -= h_submaps_erase_num_successes_[submap_idx];
141+    size_ -= h_submaps_erase_num_successes_[submap_idx];
142+  }
143+}
144+
145+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
146+bool dynamic_map<Key, Value, Scope, Allocator>::get_keys_values(Key *keys, Value *values, cudaStream_t stream) {
147+  *num_successes_ = 0;
148+  int device_id;
149+  CUCO_CUDA_TRY(cudaGetDevice(&device_id));
150+  CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
151+
152+  auto const block_size = 128;
153+  auto const stride     = 1;
154+  auto const grid_size  = (size_ + stride * block_size - 1) / (stride * block_size);
155+  detail::get_keys_values<<<grid_size, block_size, sizeof(atomic_ctr_type), stream>>>(submaps_.size(), submap_views_.data().get(), num_successes_, keys, values);
156+
157+  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
158+  size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
159+  return h_num_successes == size_;
160+}
161 }  // namespace cuco
162diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
163index f261b49..75b2c07 100644
164--- a/include/cuco/detail/dynamic_map_kernels.cuh
165+++ b/include/cuco/detail/dynamic_map_kernels.cuh
166@@ -20,6 +20,7 @@
167 #include <cuda/std/atomic>
168
169 #include <cooperative_groups.h>
170+#include <cuco/detail/bitwise_compare.cuh>
171
172 namespace cuco {
173 namespace detail {
174@@ -463,5 +464,87 @@ __global__ void contains(InputIt first,
175     key_idx += (gridDim.x * blockDim.x) / tile_size;
176   }
177 }
178+
179+template <std::size_t block_size,
180+          uint32_t tile_size,
181+          typename InputIt,
182+          typename atomicT,
183+          typename viewT,
184+          typename Hash,
185+          typename KeyEqual>
186+__global__ void erase(
187+  InputIt first, InputIt last, atomicT* num_successes, viewT* views, std::size_t num_submaps, Hash hash, KeyEqual key_equal)
188+{
189+  extern __shared__ atomicT local_num_successes[];
190+
191+  if (threadIdx.x < num_submaps) {
192+    local_num_successes[threadIdx.x] = 0;
193+  }
194+  __syncthreads();
195+
196+  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
197+  auto tid  = block_size * blockIdx.x + threadIdx.x;
198+  auto it   = first + tid / tile_size;
199+
200+  while (it < last) {
201+    for (auto submap_idx = 0; submap_idx < num_submaps; ++submap_idx) {
202+      if (views[submap_idx].erase(tile, *it, hash, key_equal)) {
203+        if (tile.thread_rank() == 0) {
204+          local_num_successes[submap_idx] += 1;
205+        }
206+        break;
207+      }
208+    }
209+    it += (gridDim.x * block_size) / tile_size;
210+  }
211+
212+  __syncthreads();
213+  if (threadIdx.x < num_submaps) {
214+    num_successes[threadIdx.x] += local_num_successes[threadIdx.x];
215+  }
216+}
217+
218+template<typename Key, typename Value, typename ViewType, typename AtomicType>
219+__global__ void get_keys_values(size_t num_submaps, ViewType *submap_views, AtomicType* global_cnt, Key* keys, Value*values) {
220+  __shared__ size_t global_offset;
221+  extern __shared__ AtomicType local_cnt[];
222+  const int default_offset_sentinel = -1;
223+
224+  for (size_t submap_idx = 0; submap_idx < num_submaps; submap_idx++){
225+    auto & submap_view = submap_views[submap_idx];
226+
227+    for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < submap_view.get_capacity();
228+         tid += blockDim.x * gridDim.x) {
229+      if (threadIdx.x == 0) {
230+        local_cnt[0] = 0;
231+      }
232+      __syncthreads();
233+
234+      auto current_slot = submap_view.begin_slot() + tid;
235+      const Key & current_key = current_slot->first.load(cuda::std::memory_order_relaxed);
236+      auto const slot_not_idle =
237+        !detail::bitwise_compare(current_key, submap_view.get_empty_key_sentinel()) &&
238+        !detail::bitwise_compare(current_key, submap_view.get_erased_key_sentinel());
239+
240+      int local_offset = default_offset_sentinel;
241+      if(slot_not_idle) {
242+        local_offset = local_cnt[0].fetch_add(1, cuda::std::memory_order_relaxed);
243+      }
244+      __syncthreads();
245+
246+      if (threadIdx.x == 0) {
247+        auto local_cnt_value = local_cnt[0].load(cuda::std::memory_order_relaxed);
248+        global_offset = global_cnt->fetch_add(local_cnt_value, cuda::std::memory_order_relaxed);
249+      }
250+      __syncthreads();
251+
252+      if (local_offset > default_offset_sentinel) {
253+        auto offset = global_offset + local_offset;
254+          keys[offset] = current_key;
255+          values[offset] = current_slot->second.load(cuda::std::memory_order_relaxed);
256+      }
257+    }
258+  }
259+}
260 }  // namespace detail
261 }  // namespace cuco
262diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
263index af3ea03..9ed2f25 100644
264--- a/include/cuco/dynamic_map.cuh
265+++ b/include/cuco/dynamic_map.cuh
266@@ -105,6 +105,8 @@ class dynamic_map {
267   using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Type of atomic counters
268   using view_type = typename static_map<Key, Value, Scope, Allocator>::device_view;  ///< Device view type
269   using mutable_view_type = typename static_map<Key, Value, Scope, Allocator>::device_mutable_view;
270+  using counter_allocator_type = typename static_map<Key, Value, Scope, Allocator>::counter_allocator_type;
271+
272   ///< Device mutable view type
273
274   dynamic_map(dynamic_map const&) = delete;
275@@ -135,7 +137,36 @@ class dynamic_map {
276   dynamic_map(std::size_t initial_capacity,
277               sentinel::empty_key<Key> empty_key_sentinel,
278               sentinel::empty_value<Value> empty_value_sentinel,
279-              Allocator const& alloc = Allocator{});
280+              Allocator const& alloc = Allocator{},
281+              cudaStream_t stream = 0);
282+
283+  /**
284+   * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and
285+   * sentinel values.
286+   *
287+   * The capacity of the map will automatically increase as the user adds key/value pairs using
288+   * `insert`.
289+   *
290+   * Capacity increases by a factor of growth_factor each time the size of the map exceeds a
291+   * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the
292+   * map's capacity grows.
293+   *
294+   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
295+   * undefined behavior results from attempting to insert any key/value pair
296+   * that contains either.
297+   *
298+   * @param initial_capacity The initial number of slots in the map
299+   * @param empty_key_sentinel The reserved key value for empty slots
300+   * @param empty_value_sentinel The reserved mapped value for empty slots
301+   * @param erased_key_sentinel The reserved value to denote erased slots
302+   * @param alloc Allocator used to allocate submap device storage
303+   */
304+dynamic_map(std::size_t initial_capacity,
305+            sentinel::empty_key<Key> empty_key_sentinel,
306+            sentinel::empty_value<Value> empty_value_sentinel,
307+            sentinel::erased_key<Key> erased_key_sentinel,
308+            Allocator const& alloc = Allocator(),
309+            cudaStream_t stream = 0);
310
311   /**
312    * @brief Destroy the map and frees its contents
313@@ -227,6 +258,25 @@ class dynamic_map {
314                 Hash hash          = Hash{},
315                 KeyEqual key_equal = KeyEqual{});
316
317+  template <typename InputIt,
318+            typename Hash     = cuco::detail::MurmurHash3_32<key_type>,
319+            typename KeyEqual = thrust::equal_to<key_type>>
320+  void erase(InputIt first,
321+             InputIt last,
322+             cudaStream_t stream = 0,
323+             Hash hash           = Hash{},
324+             KeyEqual key_equal  = KeyEqual{});
325+
326+  /**
327+   * @brief Get all keys and values in the hash map.
328+   *
329+   * @param keys The output parameter, pointing the buffer which will maintain all keys in the hash map.
330+   * @param values The output parameter, pointing the buffer which will maintain all values in the hash map.
331+   * @param stream The cuda stream.
332+   * @return Whether export keys and values successfully.
333+   */
334+  bool get_keys_values(Key *keys, Value *values, cudaStream_t stream = 0);
335+
336   /**
337    * @brief Gets the current number of elements in the map
338    *
339@@ -307,6 +357,7 @@ class dynamic_map {
340  private:
341   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
342   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
343+  key_type erased_key_sentinel_{};  ///< Key value that represents an erased slot
344   std::size_t size_{};                  ///< Number of keys in the map
345   std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
346   float max_load_factor_{};             ///< Max load factor before capacity growth
347@@ -319,6 +370,11 @@ class dynamic_map {
348   std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
349   atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
350   Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
351+
352+  counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate counters
353+  atomic_ctr_type* d_submaps_erase_num_successes_; ///< number of successfully erased keys on erase, atomic on device.
354+  atomic_ctr_type* h_submaps_erase_num_successes_; ///< number of successfully erased keys on erase, atomic on host.
355+  const size_t max_num_submaps_ = 256;  ///< The max number of submaps.
356 };
357 }  // namespace cuco
358