template <typename Hash, typename KeyEq>
class CUDAHashmap : public DeviceHashmap<Hash, KeyEq> {
public:
    CUDAHashmap(size_t init_buckets,
                size_t init_capacity,
                size_t dsize_key,
                size_t dsize_value,
                const Device& device);

    ~CUDAHashmap();

    void Rehash(size_t buckets) override;

    /// Parallel insert contiguous arrays of keys and values.
    void Insert(const void* input_keys,
                const void* input_values,
                iterator_t* output_iterators,
                bool* output_masks,
                size_t count) override;

    /// Parallel activate a contiguous array of keys (insert without values).
    void Activate(const void* input_keys,
                  iterator_t* output_iterators,
                  bool* output_masks,
                  size_t count) override;

    /// Parallel find a contiguous array of keys.
    void Find(const void* input_keys,
              iterator_t* output_iterators,
              bool* output_masks,
              size_t count) override;

    /// Parallel erase a contiguous array of keys.
    void Erase(const void* input_keys,
               bool* output_masks,
               size_t count) override;

    /// Parallel collect all iterators in the hash table.
    size_t GetIterators(iterator_t* output_iterators) override;

    /// Parallel unpack iterators to contiguous arrays of keys and/or values.
    void UnpackIterators(const iterator_t* input_iterators,
                         const bool* input_masks,
                         void* output_keys,
                         void* output_values,
                         size_t count) override;

    /// Parallel assign iterators in-place with associated values.
    void AssignIterators(iterator_t* input_iterators,
                         const bool* input_masks,
                         const void* input_values,
                         size_t count) override;

    std::vector<size_t> BucketSizes() const override;

    /// Return size / bucket_count.
    float LoadFactor() const override;

    size_t Size() const override;

protected:
    /// Kernel context; passed to CUDA kernels by value.
    CUDAHashmapImplContext<Hash, KeyEq> gpu_context_;

    std::shared_ptr<InternalKvPairManager> kv_mgr_;
    std::shared_ptr<InternalNodeManager> node_mgr_;

    void InsertImpl(const void* input_keys,
                    const void* input_values,
                    iterator_t* output_iterators,
                    bool* output_masks,
                    size_t count);

    void Allocate(size_t bucket_count, size_t capacity);
};
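/* A minimal usage sketch of the public interface above; not part of this
   header. The functor names MyHash / MyKeyEq, the key/value types, and the
   sizes are illustrative assumptions, and filling the key/value buffers is
   elided.

   void ExampleUsage() {
       Device device("CUDA:0");
       CUDAHashmap<MyHash, MyKeyEq> hashmap(1024,             // init_buckets
                                            4096,             // init_capacity
                                            sizeof(int64_t),  // dsize_key
                                            sizeof(float),    // dsize_value
                                            device);

       size_t count = 1000;
       void* d_keys = MemoryManager::Malloc(sizeof(int64_t) * count, device);
       void* d_values = MemoryManager::Malloc(sizeof(float) * count, device);
       // ... fill d_keys / d_values with `count` entries on the device ...

       auto output_iterators = static_cast<iterator_t*>(MemoryManager::Malloc(
               sizeof(iterator_t) * count, device));
       auto output_masks = static_cast<bool*>(
               MemoryManager::Malloc(sizeof(bool) * count, device));

       hashmap.Insert(d_keys, d_values, output_iterators, output_masks, count);
       hashmap.Find(d_keys, output_iterators, output_masks, count);

       MemoryManager::Free(d_keys, device);
       MemoryManager::Free(d_values, device);
       MemoryManager::Free(output_iterators, device);
       MemoryManager::Free(output_masks, device);
   }
*/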
template <typename Hash, typename KeyEq>
CUDAHashmap<Hash, KeyEq>::CUDAHashmap(size_t init_buckets,
                                      size_t init_capacity,
                                      size_t dsize_key,
                                      size_t dsize_value,
                                      const Device& device)
    : DeviceHashmap<Hash, KeyEq>(
              init_buckets, init_capacity, dsize_key, dsize_value, device) {
    Allocate(init_buckets, init_capacity);
}
template <typename Hash, typename KeyEq>
CUDAHashmap<Hash, KeyEq>::~CUDAHashmap() {
    MemoryManager::Free(gpu_context_.bucket_list_head_, this->device_);
}
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Rehash(size_t buckets) {
    // Dump all active entries, re-allocate, then re-insert them.
    size_t iterator_count = Size();

    void* output_keys = nullptr;
    void* output_values = nullptr;
    iterator_t* output_iterators = nullptr;
    bool* output_masks = nullptr;

    if (iterator_count > 0) {
        output_keys = MemoryManager::Malloc(this->dsize_key_ * iterator_count,
                                            this->device_);
        output_values = MemoryManager::Malloc(
                this->dsize_value_ * iterator_count, this->device_);
        output_iterators = static_cast<iterator_t*>(MemoryManager::Malloc(
                sizeof(iterator_t) * iterator_count, this->device_));
        output_masks = static_cast<bool*>(MemoryManager::Malloc(
                sizeof(bool) * iterator_count, this->device_));

        GetIterators(output_iterators);
        UnpackIterators(output_iterators, /*input_masks=*/nullptr, output_keys,
                        output_values, iterator_count);
    }

    // Preserve the average capacity-per-bucket ratio of the old table.
    float avg_capacity_per_bucket =
            float(this->capacity_) / float(this->bucket_count_);

    MemoryManager::Free(gpu_context_.bucket_list_head_, this->device_);
    Allocate(buckets, size_t(std::ceil(buckets * avg_capacity_per_bucket)));

    if (iterator_count > 0) {
        InsertImpl(output_keys, output_values, output_iterators, output_masks,
                   iterator_count);

        MemoryManager::Free(output_keys, this->device_);
        MemoryManager::Free(output_values, this->device_);
        MemoryManager::Free(output_iterators, this->device_);
        MemoryManager::Free(output_masks, this->device_);
    }
}
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Insert(const void* input_keys,
                                      const void* input_values,
                                      iterator_t* output_iterators,
                                      bool* output_masks,
                                      size_t count) {
    // Grow the table first if the batch would exceed the current capacity.
    size_t new_size = Size() + count;
    if (new_size > this->capacity_) {
        float avg_capacity_per_bucket =
                float(this->capacity_) / float(this->bucket_count_);
        size_t expected_buckets = std::max(
                this->bucket_count_ * 2,
                size_t(std::ceil(new_size / avg_capacity_per_bucket)));
        Rehash(expected_buckets);
    }
    InsertImpl(input_keys, input_values, output_iterators, output_masks, count);
}
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Activate(const void* input_keys,
                                        iterator_t* output_iterators,
                                        bool* output_masks,
                                        size_t count) {
    // Same growth policy as Insert; keys are inserted without values.
    size_t new_size = Size() + count;
    if (new_size > this->capacity_) {
        float avg_capacity_per_bucket =
                float(this->capacity_) / float(this->bucket_count_);
        size_t expected_buckets = std::max(
                this->bucket_count_ * 2,
                size_t(std::ceil(new_size / avg_capacity_per_bucket)));
        Rehash(expected_buckets);
    }
    InsertImpl(input_keys, nullptr, output_iterators, output_masks, count);
}
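/* A worked example of the growth heuristic used by Insert and Activate above,
   with arbitrary illustrative numbers (not from the source):

   capacity_ = 4096, bucket_count_ = 1024, Size() = 4000, count = 6000
   avg_capacity_per_bucket = 4096 / 1024                      = 4
   new_size                = 4000 + 6000                      = 10000 > 4096
   ceil(new_size / avg_capacity_per_bucket) = ceil(10000 / 4) = 2500
   expected_buckets        = max(2 * 1024, 2500)              = 2500
   Rehash(2500) then allocates ceil(2500 * 4) = 10000 key-value slots.
*/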
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Find(const void* input_keys,
                                    iterator_t* output_iterators,
                                    bool* output_masks,
                                    size_t count) {
    if (count == 0) return;

    OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));

    const size_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    FindKernel<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, output_iterators, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());
}
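/* The grid size above is plain ceiling division of `count` by the block size,
   so every input element is covered and only the tail block is partially
   filled. A standalone sketch of the same launch pattern; the kernel and the
   constant below are illustrative, not part of this header:

   constexpr int kBlockSize = 128;  // stand-in for kThreadsPerBlock

   __global__ void TouchKernel(int* data, size_t count) {
       size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
       if (tid < count) {  // guard: threads past `count` do nothing
           data[tid] += 1;
       }
   }

   void Launch(int* d_data, size_t count) {
       const size_t num_blocks = (count + kBlockSize - 1) / kBlockSize;
       TouchKernel<<<num_blocks, kBlockSize>>>(d_data, count);
       cudaDeviceSynchronize();
   }
*/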
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Erase(const void* input_keys,
                                     bool* output_masks,
                                     size_t count) {
    if (count == 0) return;

    OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));

    // addr_t is the internal 32-bit buffer address type (see Traits.h).
    auto iterator_addrs = static_cast<addr_t*>(
            MemoryManager::Malloc(sizeof(addr_t) * count, this->device_));

    const size_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    EraseKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, iterator_addrs, output_masks, count);
    EraseKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, iterator_addrs, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    MemoryManager::Free(iterator_addrs, this->device_);
}
template <typename Hash, typename KeyEq>
size_t CUDAHashmap<Hash, KeyEq>::GetIterators(iterator_t* output_iterators) {
    auto iterator_count = static_cast<uint32_t*>(
            MemoryManager::Malloc(sizeof(uint32_t), this->device_));
    cudaMemset(iterator_count, 0, sizeof(uint32_t));

    // One warp per bucket traverses its slab list.
    const size_t num_blocks =
            (gpu_context_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
            kThreadsPerBlock;
    GetIteratorsKernel<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, output_iterators, iterator_count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    uint32_t ret;
    MemoryManager::MemcpyToHost(&ret, iterator_count, this->device_,
                                sizeof(uint32_t));
    MemoryManager::Free(iterator_count, this->device_);
    return static_cast<size_t>(ret);
}
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::UnpackIterators(
        const iterator_t* input_iterators,
        const bool* input_masks,
        void* output_keys,
        void* output_values,
        size_t iterator_count) {
    if (iterator_count == 0) return;

    const size_t num_blocks =
            (iterator_count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    UnpackIteratorsKernel<<<num_blocks, kThreadsPerBlock>>>(
            input_iterators, input_masks, output_keys, output_values,
            this->dsize_key_, this->dsize_value_, iterator_count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());
}
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::AssignIterators(iterator_t* input_iterators,
                                               const bool* input_masks,
                                               const void* input_values,
                                               size_t iterator_count) {
    if (iterator_count == 0) return;

    const size_t num_blocks =
            (iterator_count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    AssignIteratorsKernel<<<num_blocks, kThreadsPerBlock>>>(
            input_iterators, input_masks, input_values, this->dsize_value_,
            iterator_count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());
}
template <typename Hash, typename KeyEq>
std::vector<size_t> CUDAHashmap<Hash, KeyEq>::BucketSizes() const {
    thrust::device_vector<size_t> elems_per_bucket(gpu_context_.bucket_count_);
    thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);

    const size_t num_blocks =
            (gpu_context_.capacity_ + kThreadsPerBlock - 1) / kThreadsPerBlock;
    CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()));
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    std::vector<size_t> result(gpu_context_.bucket_count_);
    thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
                 result.begin());
    return result;
}
template <typename Hash, typename KeyEq>
float CUDAHashmap<Hash, KeyEq>::LoadFactor() const {
    return float(Size()) / float(this->bucket_count_);
}
template <typename Hash, typename KeyEq>
size_t CUDAHashmap<Hash, KeyEq>::Size() const {
    // The heap counter of the key-value manager tracks the number of active
    // key-value pairs.
    return *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_);
}
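/* Size() reads the device-resident heap counter directly through a
   thrust::device_ptr: dereferencing it on the host copies the value back from
   device memory. A standalone illustration of that mechanism; the names and
   values below are illustrative, not part of this header:

   #include <thrust/device_malloc.h>
   #include <thrust/device_free.h>

   int ReadDeviceInt() {
       thrust::device_ptr<int> counter = thrust::device_malloc<int>(1);
       *counter = 7;          // host-side write: copies 7 to device memory
       int value = *counter;  // host-side read: copies the int back to host
       thrust::device_free(counter);
       return value;          // 7
   }
*/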
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::InsertImpl(const void* input_keys,
                                          const void* input_values,
                                          iterator_t* output_iterators,
                                          bool* output_masks,
                                          size_t count) {
    if (count == 0) return;

    auto iterator_addrs = static_cast<addr_t*>(
            MemoryManager::Malloc(sizeof(addr_t) * count, this->device_));

    // Reserve `count` key-value slots for the whole batch by bumping the heap
    // counter once on the host.
    int prev_heap_counter =
            *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_);
    *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_) =
            prev_heap_counter + count;

    const size_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    InsertKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, iterator_addrs, prev_heap_counter, count);
    InsertKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, iterator_addrs, output_masks, count);
    InsertKernelPass2<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_values, iterator_addrs, output_iterators,
            output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    MemoryManager::Free(iterator_addrs, this->device_);
}
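/* The heap counter is bumped once on the host for the whole batch, presumably
   so that each element's key-value slot can be derived from prev_heap_counter
   inside the kernels rather than reserved one at a time. A hypothetical
   host-side analogue of that batched reservation:

   int heap_counter = 42;    // slots 0..41 already handed out
   size_t count = 100;       // batch size
   int prev = heap_counter;  // first slot reserved for this batch
   heap_counter = prev + static_cast<int>(count);
   // element i of the batch can then use slot (prev + i)
*/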
template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Allocate(size_t bucket_count, size_t capacity) {
    this->bucket_count_ = bucket_count;
    this->capacity_ = capacity;

    // Buffer for key-value pairs.
    kv_mgr_ = std::make_shared<InternalKvPairManager>(
            this->capacity_, this->dsize_key_, this->dsize_value_,
            this->device_);

    // Buffer for slab-list nodes.
    node_mgr_ = std::make_shared<InternalNodeManager>(this->device_);

    // Bucket list heads.
    gpu_context_.bucket_list_head_ = static_cast<Slab*>(MemoryManager::Malloc(
            sizeof(Slab) * this->bucket_count_, this->device_));
    OPEN3D_CUDA_CHECK(cudaMemset(gpu_context_.bucket_list_head_, 0xFF,
                                 sizeof(Slab) * this->bucket_count_));

    gpu_context_.Setup(this->bucket_count_, this->capacity_, this->dsize_key_,
                       this->dsize_value_, node_mgr_->gpu_context_,
                       kv_mgr_->gpu_context_);
}