template <typename Hash, typename KeyEq>
class CUDAHashmapImplContext {
public:
    CUDAHashmapImplContext();

    __host__ void Setup(int64_t init_buckets,
                        int64_t init_capacity,
                        int64_t dsize_key,
                        int64_t dsize_value,
                        const InternalNodeManagerContext& node_mgr_ctx,
                        const CUDAHashmapBufferContext& kv_mgr_ctx);

    __device__ bool Insert(bool lane_active,
                           uint32_t lane_id,
                           uint32_t bucket_id,
                           const void* key_ptr,
                           addr_t iterator_addr);
    __device__ Pair<addr_t, bool> Find(bool lane_active,
                                       uint32_t lane_id,
                                       uint32_t bucket_id,
                                       const void* key_ptr);
    __device__ Pair<addr_t, bool> Erase(bool lane_active,
                                        uint32_t lane_id,
                                        uint32_t bucket_id,
                                        const void* key_ptr);

    __device__ void WarpSyncKey(const void* key_ptr,
                                uint32_t lane_id,
                                void* ret_key_ptr);
    __device__ int32_t WarpFindKey(const void* src_key_ptr,
                                   uint32_t lane_id,
                                   addr_t ptr);
    __device__ int32_t WarpFindEmpty(addr_t unit_data);

    __device__ int64_t ComputeBucket(const void* key_ptr) const;

    __device__ addr_t AllocateSlab(uint32_t lane_id);
    __device__ void FreeSlab(addr_t slab_ptr);

    __device__ addr_t* get_unit_ptr_from_list_nodes(addr_t slab_ptr,
                                                    uint32_t lane_id) {
        return node_mgr_ctx_.get_unit_ptr_from_slab(slab_ptr, lane_id);
    }
    __device__ addr_t* get_unit_ptr_from_list_head(uint32_t bucket_id,
                                                   uint32_t lane_id) {
        return reinterpret_cast<addr_t*>(bucket_list_head_) +
               bucket_id * kWarpSize + lane_id;
    }

public:
    Hash hash_fn_;
    KeyEq cmp_fn_;

    int64_t bucket_count_;
    int64_t capacity_;
    int64_t dsize_key_;
    int64_t dsize_value_;

    Slab* bucket_list_head_;
    InternalNodeManagerContext node_mgr_ctx_;
    CUDAHashmapBufferContext kv_mgr_ctx_;
};
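// Kernels. Each bucket owns a linked list of 32-unit slabs, and all probing is
// warp-cooperative: the 32 lanes of a warp read one slab in a single coalesced
// load, then ballots and shuffles decide which lane holds a match, an empty
// slot, or the next-slab pointer. Whole warps must therefore enter the
// device-side Insert/Find/Erase calls together.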
template <typename Hash, typename KeyEq>
__global__ void InsertKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  int heap_counter_prev,
                                  int64_t count);

template <typename Hash, typename KeyEq>
__global__ void InsertKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count);

template <typename Hash, typename KeyEq>
__global__ void InsertKernelPass2(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                  const void* input_values,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count);

template <typename Hash, typename KeyEq>
__global__ void FindKernel(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                           const void* input_keys,
                           addr_t* output_addrs,
                           bool* output_masks,
                           int64_t count);

template <typename Hash, typename KeyEq>
__global__ void EraseKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                 const void* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count);

template <typename Hash, typename KeyEq>
__global__ void EraseKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count);

template <typename Hash, typename KeyEq>
__global__ void GetActiveIndicesKernel(
        CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
        addr_t* output_addrs,
        uint32_t* output_iterator_count);

template <typename Hash, typename KeyEq>
__global__ void CountElemsPerBucketKernel(
        CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
        int64_t* bucket_elem_counts);
template <typename Hash, typename KeyEq>
CUDAHashmapImplContext<Hash, KeyEq>::CUDAHashmapImplContext()
    : bucket_count_(0), bucket_list_head_(nullptr) {}

template <typename Hash, typename KeyEq>
__host__ void CUDAHashmapImplContext<Hash, KeyEq>::Setup(
        int64_t init_buckets,
        int64_t init_capacity,
        int64_t dsize_key,
        int64_t dsize_value,
        const InternalNodeManagerContext& node_mgr_ctx,
        const CUDAHashmapBufferContext& pair_allocator_ctx) {
    bucket_count_ = init_buckets;
    capacity_ = init_capacity;
    dsize_key_ = dsize_key;
    dsize_value_ = dsize_value;

    node_mgr_ctx_ = node_mgr_ctx;
    kv_mgr_ctx_ = pair_allocator_ctx;

    // Keys are copied and compared in 4-byte chunks, so dsize_key must be a
    // multiple of sizeof(int).
    hash_fn_.key_size_in_int_ = dsize_key / sizeof(int);
    cmp_fn_.key_size_in_int_ = dsize_key / sizeof(int);
}
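// Warp-cooperative insertion. Each lane carries its own (key, iterator_addr)
// request; lanes take turns serving as `src_lane` until the ballot of active
// lanes drains to zero. Returns true iff this lane's key was newly inserted.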
template <typename Hash, typename KeyEq>
__device__ bool CUDAHashmapImplContext<Hash, KeyEq>::Insert(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const void* key_ptr,
        addr_t iterator_addr) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t curr_slab_ptr = kHeadSlabAddr;
    uint8_t src_key[kMaxKeyByteSize];
    bool mask = false;

    // Loop while any lane in the warp still has a pending insertion.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the list head if the previous query was finished.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key_ptr, src_lane, src_key);

        // Each lane reads one 4-byte unit of the current slab.
        uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));
        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);
        int32_t lane_empty = WarpFindEmpty(unit_data);

        // Branch 1: the key already exists; abort this lane's insertion.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                lane_active = false;
            }
        }
        // Branch 2: an empty slot exists; try to claim it with a CAS.
        else if (lane_empty >= 0) {
            if (lane_id == src_lane) {
                addr_t* unit_data_ptr =
                        (curr_slab_ptr == kHeadSlabAddr)
                                ? get_unit_ptr_from_list_head(src_bucket,
                                                              lane_empty)
                                : get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                               lane_empty);

                addr_t old_iterator_addr =
                        atomicCAS((unsigned int*)unit_data_ptr, kEmptyNodeAddr,
                                  iterator_addr);

                // Success; otherwise another thread won the slot and we retry.
                if (old_iterator_addr == kEmptyNodeAddr) {
                    lane_active = false;
                    mask = true;
                }
            }
        }
        // Branch 3: this slab is full; walk to or allocate the next slab.
        else {
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);

            // Branch 3.1: a next slab exists; restart probing from it.
            if (next_slab_ptr != kEmptySlabAddr) {
                curr_slab_ptr = next_slab_ptr;
            }
            // Branch 3.2: no next slab; allocate one and try to link it.
            else {
                addr_t new_next_slab_ptr = AllocateSlab(lane_id);

                if (lane_id == kNextSlabPtrLaneId) {
                    addr_t* unit_data_ptr =
                            (curr_slab_ptr == kHeadSlabAddr)
                                    ? get_unit_ptr_from_list_head(
                                              src_bucket, kNextSlabPtrLaneId)
                                    : get_unit_ptr_from_list_nodes(
                                              curr_slab_ptr,
                                              kNextSlabPtrLaneId);

                    addr_t old_next_slab_ptr =
                            atomicCAS((unsigned int*)unit_data_ptr,
                                      kEmptySlabAddr, new_next_slab_ptr);

                    // Another warp linked a slab first: free ours and retry.
                    if (old_next_slab_ptr != kEmptySlabAddr) {
                        FreeSlab(new_next_slab_ptr);
                    }
                }
            }
        }

        prev_work_queue = work_queue;
    }

    return mask;
}
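// Warp-cooperative lookup. Returns the iterator address of the matching pair
// together with a validity mask bit for this lane's query.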
template <typename Hash, typename KeyEq>
__device__ Pair<addr_t, bool> CUDAHashmapImplContext<Hash, KeyEq>::Find(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const void* query_key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = work_queue;
    uint32_t curr_slab_ptr = kHeadSlabAddr;

    addr_t iterator = kNullAddr;
    bool mask = false;

    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the list head if the previous query was finished.
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        uint8_t src_key[kMaxKeyByteSize];
        WarpSyncKey(query_key, src_lane, src_key);

        uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));
        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);

        // The key exists: broadcast its address to the source lane.
        if (lane_found >= 0) {
            addr_t found_pair_internal_ptr = __shfl_sync(
                    kSyncLanesMask, unit_data, lane_found, kWarpSize);

            if (lane_id == src_lane) {
                lane_active = false;
                iterator = found_pair_internal_ptr;
                mask = true;
            }
        }
        // Not in this slab: move to the next slab, or give up at the list end.
        else {
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);

            if (next_slab_ptr == kEmptySlabAddr) {
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                curr_slab_ptr = next_slab_ptr;
            }
        }

        prev_work_queue = work_queue;
    }

    return make_pair(iterator, mask);
}
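// Warp-cooperative erase. Clears the matching slot with an atomicExch and
// reports the freed iterator address so a later pass can return it to the heap.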
template <typename Hash, typename KeyEq>
__device__ Pair<addr_t, bool> CUDAHashmapImplContext<Hash, KeyEq>::Erase(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const void* key_ptr) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t curr_slab_ptr = kHeadSlabAddr;
    uint8_t src_key[kMaxKeyByteSize];

    addr_t iterator_addr = kNullAddr;
    bool mask = false;

    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        curr_slab_ptr =
                (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key_ptr, src_lane, src_key);

        uint32_t unit_data =
                (curr_slab_ptr == kHeadSlabAddr)
                        ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
                        : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                         lane_id));
        int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);

        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                addr_t* unit_data_ptr =
                        (curr_slab_ptr == kHeadSlabAddr)
                                ? get_unit_ptr_from_list_head(src_bucket,
                                                              lane_found)
                                : get_unit_ptr_from_list_nodes(curr_slab_ptr,
                                                               lane_found);

                // Clear the slot; a non-empty old value means we own the free.
                uint32_t pair_to_delete = atomicExch(
                        (unsigned int*)unit_data_ptr, kEmptyNodeAddr);
                mask = pair_to_delete != kEmptyNodeAddr;
                iterator_addr = pair_to_delete;
                lane_active = false;
            }
        } else {
            addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
                                               kNextSlabPtrLaneId, kWarpSize);
            if (next_slab_ptr == kEmptySlabAddr) {
                // Key not found; deactivate the source lane.
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                curr_slab_ptr = next_slab_ptr;
            }
        }

        prev_work_queue = work_queue;
    }

    return make_pair(iterator_addr, mask);
}
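// Broadcast the source lane's key to all lanes of the warp, 4 bytes at a time.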
template <typename Hash, typename KeyEq>
__device__ void CUDAHashmapImplContext<Hash, KeyEq>::WarpSyncKey(
        const void* key_ptr, uint32_t lane_id, void* ret_key_ptr) {
    auto dst_key_ptr = static_cast<int*>(ret_key_ptr);
    auto src_key_ptr = static_cast<const int*>(key_ptr);
    for (int i = 0; i < hash_fn_.key_size_in_int_; ++i) {
        dst_key_ptr[i] =
                __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
    }
}
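// Each lane checks its own slab unit; the ballot then identifies the first
// lane holding a matching key (or an empty slot). Returns -1 if none exists.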
template <typename Hash, typename KeyEq>
__device__ int32_t CUDAHashmapImplContext<Hash, KeyEq>::WarpFindKey(
        const void* src_key_ptr, uint32_t lane_id, addr_t ptr) {
    bool is_lane_found =
            // Select the key-holding lanes of the slab.
            ((1 << lane_id) & kNodePtrLanesMask)
            // Skip empty slots.
            && (ptr != kEmptyNodeAddr)
            // Compare the stored key in the heap with the query key.
            && cmp_fn_(kv_mgr_ctx_.ExtractIterator(ptr).first, src_key_ptr);

    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
}

template <typename Hash, typename KeyEq>
__device__ int32_t CUDAHashmapImplContext<Hash, KeyEq>::WarpFindEmpty(
        addr_t ptr) {
    bool is_lane_empty = (ptr == kEmptyNodeAddr);
    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
}
template <typename Hash, typename KeyEq>
__device__ int64_t CUDAHashmapImplContext<Hash, KeyEq>::ComputeBucket(
        const void* key_ptr) const {
    // Map the key's hash to a bucket index.
    return hash_fn_(key_ptr) % bucket_count_;
}

template <typename Hash, typename KeyEq>
__device__ addr_t CUDAHashmapImplContext<Hash, KeyEq>::AllocateSlab(
        uint32_t lane_id) {
    return node_mgr_ctx_.WarpAllocate(lane_id);
}

template <typename Hash, typename KeyEq>
__device__ void CUDAHashmapImplContext<Hash, KeyEq>::FreeSlab(
        addr_t slab_ptr) {
    node_mgr_ctx_.FreeUntouched(slab_ptr);
}
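// Kernels. Insertion runs in three passes: Pass0 pre-allocates a heap entry
// per input and copies the key, Pass1 links the entries into the slab lists,
// and Pass2 copies values for the insertions that succeeded.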
template <typename Hash, typename KeyEq>
__global__ void InsertKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  int heap_counter_prev,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        // Pre-allocate an entry from the heap and copy the key into it.
        addr_t iterator_addr =
                hash_ctx.kv_mgr_ctx_.heap_[heap_counter_prev + tid];
        auto iterator =
                hash_ctx.kv_mgr_ctx_.ExtractIterator(iterator_addr);

        MEMCPY_AS_INTS(iterator.first,
                       static_cast<const uint8_t*>(input_keys) +
                               tid * hash_ctx.dsize_key_,
                       hash_ctx.dsize_key_);
        output_addrs[tid] = iterator_addr;
    }
}
template <typename Hash, typename KeyEq>
__global__ void InsertKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                  const void* input_keys,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Whole warps must run in lockstep for the cooperative probing.
    if (tid - lane_id >= count) {
        return;
    }

    bool lane_active = false;
    uint32_t bucket_id = 0;
    addr_t iterator_addr = 0;

    // Dummy key for lanes without a real input, to keep warp syncs valid.
    uint8_t dummy_key[kMaxKeyByteSize];
    const void* key = reinterpret_cast<const void*>(dummy_key);

    if (tid < count) {
        lane_active = true;
        key = static_cast<const uint8_t*>(input_keys) +
              tid * hash_ctx.dsize_key_;
        iterator_addr = output_addrs[tid];
        bucket_id = hash_ctx.ComputeBucket(key);
    }

    bool mask = hash_ctx.Insert(lane_active, lane_id, bucket_id, key,
                                iterator_addr);

    if (tid < count) {
        output_masks[tid] = mask;
    }
}
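// Pass 2: copy values for the entries that were successfully linked in Pass 1.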
template <typename Hash, typename KeyEq>
__global__ void InsertKernelPass2(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                  const void* input_values,
                                  addr_t* output_addrs,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        addr_t iterator_addr = output_addrs[tid];

        if (output_masks[tid]) {
            auto iterator =
                    hash_ctx.kv_mgr_ctx_.ExtractIterator(iterator_addr);

            // Copy the value in only when one is provided.
            if (input_values != nullptr) {
                MEMCPY_AS_INTS(iterator.second,
                               static_cast<const uint8_t*>(input_values) +
                                       tid * hash_ctx.dsize_value_,
                               hash_ctx.dsize_value_);
            }
        } else {
            // The key already existed: return the pre-allocated entry.
            hash_ctx.kv_mgr_ctx_.DeviceFree(iterator_addr);
        }
    }
}
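// One query per thread; a full warp cooperates on each lookup.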
template <typename Hash, typename KeyEq>
__global__ void FindKernel(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                           const void* input_keys,
                           addr_t* output_addrs,
                           bool* output_masks,
                           int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Run whole warps so the cooperative probing stays in lockstep.
    if ((tid - lane_id) >= count) {
        return;
    }

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy key for lanes without a real query.
    uint8_t dummy_key[kMaxKeyByteSize];
    const void* key = reinterpret_cast<const void*>(dummy_key);
    Pair<addr_t, bool> result;

    if (tid < count) {
        lane_active = true;
        key = static_cast<const uint8_t*>(input_keys) +
              tid * hash_ctx.dsize_key_;
        bucket_id = hash_ctx.ComputeBucket(key);
    }

    result = hash_ctx.Find(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_addrs[tid] = result.first;
        output_masks[tid] = result.second;
    }
}
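// Erase runs in two passes: Pass0 unlinks entries from the slab lists,
// Pass1 returns the freed entries to the heap.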
template <typename Hash, typename KeyEq>
__global__ void EraseKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                 const void* input_keys,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    if (tid - lane_id >= count) {
        return;
    }

    bool lane_active = false;
    uint32_t bucket_id = 0;

    uint8_t dummy_key[kMaxKeyByteSize];
    const void* key = reinterpret_cast<const void*>(dummy_key);

    if (tid < count) {
        lane_active = true;
        key = static_cast<const uint8_t*>(input_keys) +
              tid * hash_ctx.dsize_key_;
        bucket_id = hash_ctx.ComputeBucket(key);
    }

    auto result = hash_ctx.Erase(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_addrs[tid] = result.first;
        output_masks[tid] = result.second;
    }
}
template <typename Hash, typename KeyEq>
__global__ void EraseKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
                                 addr_t* output_addrs,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < count && output_masks[tid]) {
        hash_ctx.kv_mgr_ctx_.DeviceFree(output_addrs[tid]);
    }
}
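// One warp per bucket: walk the slab list and collect every active entry.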
template <typename Hash, typename KeyEq>
__global__ void GetActiveIndicesKernel(
        CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
        addr_t* output_addrs,
        uint32_t* output_iterator_count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign one warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= hash_ctx.bucket_count_) {
        return;
    }

    uint32_t src_unit_data =
            *hash_ctx.get_unit_ptr_from_list_head(bucket_id, lane_id);
    bool is_active = src_unit_data != kEmptyNodeAddr;

    if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
        uint32_t index = atomicAdd(output_iterator_count, 1);
        output_addrs[index] = src_unit_data;
    }
    addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                              kWarpSize);

    // Walk the rest of the slab list.
    while (next != kEmptySlabAddr) {
        src_unit_data =
                *hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id);
        is_active = (src_unit_data != kEmptyNodeAddr);

        if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
            uint32_t index = atomicAdd(output_iterator_count, 1);
            output_addrs[index] = src_unit_data;
        }
        next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                           kWarpSize);
    }
}
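// One warp per bucket: count the non-empty slots across the whole slab list.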
template <typename Hash, typename KeyEq>
__global__ void CountElemsPerBucketKernel(
        CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
        int64_t* bucket_elem_counts) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign one warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= hash_ctx.bucket_count_) {
        return;
    }

    uint32_t src_unit_data =
            *hash_ctx.get_unit_ptr_from_list_head(bucket_id, lane_id);

    // Count entries in the head slab, then walk the remaining slabs.
    uint32_t count = __popc(
            __ballot_sync(kNodePtrLanesMask, src_unit_data != kEmptyNodeAddr));
    addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                              kWarpSize);

    while (next != kEmptySlabAddr) {
        src_unit_data =
                *hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id);
        count += __popc(__ballot_sync(kNodePtrLanesMask,
                                      src_unit_data != kEmptyNodeAddr));
        next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
                           kWarpSize);
    }

    // A single lane writes the per-bucket result.
    if (lane_id == 0) {
        bucket_elem_counts[bucket_id] = count;
    }
}