template <typename Key, typename Hash, typename Eq>
class SlabHashBackendImpl {
public:
    SlabHashBackendImpl();

    __host__ void Setup(int64_t init_buckets,
                        const SlabNodeManagerImpl& node_mgr_impl,
                        const CUDAHashBackendBufferAccessor& buffer_accessor);

    /// Warp-insert a pre-allocated buf_index at key. Find, Erase, the warp
    /// helpers, ComputeBucket, and the slab allocator wrappers are declared
    /// analogously; all definitions follow below.
    __device__ bool Insert(bool lane_active, uint32_t lane_id,
                           uint32_t bucket_id, const Key& key,
                           buf_index_t buf_index);

    // The head slab lives in the bucket array; chained slabs live in the
    // node manager's pool.
    __device__ uint32_t* SlabEntryPtr(uint32_t bucket_id, uint32_t lane_id,
                                      uint32_t slab_ptr) {
        return (slab_ptr == kHeadSlabAddr)
                       ? SlabEntryPtrFromHead(bucket_id, lane_id)
                       : SlabEntryPtrFromNodes(slab_ptr, lane_id);
    }
    __device__ uint32_t* SlabEntryPtrFromNodes(uint32_t slab_ptr,
                                               uint32_t lane_id);
    __device__ uint32_t* SlabEntryPtrFromHead(uint32_t bucket_id,
                                              uint32_t lane_id) {
        return reinterpret_cast<uint32_t*>(bucket_list_head_) +
               bucket_id * kWarpSize + lane_id;
    }

    Hash hash_fn_;
    Eq eq_fn_;
    int64_t bucket_count_;
    Slab* bucket_list_head_;
    SlabNodeManagerImpl node_mgr_impl_;
    CUDAHashBackendBufferAccessor buffer_accessor_;
    int key_size_in_int_ = sizeof(Key) / sizeof(int);
};
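The Key, Hash, and Eq parameters mirror std::unordered_map: Hash must map a
key to an integer on device and Eq must compare two keys, since ComputeBucket
evaluates hash_fn_(key) % bucket_count_ and WarpFindKey evaluates eq_fn_. A
minimal sketch of compatible functors for a 64-bit key; the names here are
hypothetical, not part of this header:

// Hypothetical example functors; not part of this header.
struct UInt64Hash {
    __device__ uint64_t operator()(const uint64_t& key) const {
        // Any device-callable mixing function works here.
        uint64_t h = key;
        h ^= h >> 33;
        h *= 0xff51afd7ed558ccdULL;
        h ^= h >> 33;
        return h;
    }
};

struct UInt64Eq {
    __device__ bool operator()(const uint64_t& a, const uint64_t& b) const {
        return a == b;
    }
};

// Usage sketch: SlabHashBackendImpl<uint64_t, UInt64Hash, UInt64Eq> impl;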
/// Kernels.
template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  int heap_counter_prev,
                                  int64_t count);
template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count);
template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* const* input_values_soa,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count,
                                  int64_t n_values);
template <typename Key, typename Hash, typename Eq>
__global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                           const void* input_keys,
                           buf_index_t* output_buf_indices,
                           bool* output_masks,
                           int64_t count);
template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 const void* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count);
template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count);
template <typename Key, typename Hash, typename Eq>
__global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                       buf_index_t* output_buf_indices,
                                       uint32_t* output_count);
template <typename Key, typename Hash, typename Eq>
__global__ void CountElemsPerBucketKernel(
        SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts);
template <typename Key, typename Hash, typename Eq>
SlabHashBackendImpl<Key, Hash, Eq>::SlabHashBackendImpl()
    : bucket_count_(0), bucket_list_head_(nullptr) {}
template <typename Key, typename Hash, typename Eq>
void SlabHashBackendImpl<Key, Hash, Eq>::Setup(
        int64_t init_buckets,
        const SlabNodeManagerImpl& allocator_impl,
        const CUDAHashBackendBufferAccessor& buffer_accessor) {
    bucket_count_ = init_buckets;
    node_mgr_impl_ = allocator_impl;
    buffer_accessor_ = buffer_accessor;
}
template <typename Key, typename Hash, typename Eq>
__device__ bool SlabHashBackendImpl<Key, Hash, Eq>::Insert(
        bool lane_active, uint32_t lane_id, uint32_t bucket_id,
        const Key& key, buf_index_t buf_index) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t slab_ptr = kHeadSlabAddr;
    Key src_key;
    bool mask = false;

    // Loop while any lane in the warp still has a pending insertion.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the list head if the last insertion finished.
        slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key, src_lane, src_key);

        // Each lane loads its entry of the current slab.
        uint32_t slab_entry = *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
        int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
        int32_t lane_empty = WarpFindEmpty(slab_entry);

        // Branch 1: the key already exists; abort.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                lane_active = false;
            }
        }
        // Branch 2: an empty slot is available; try to claim it.
        else if (lane_empty >= 0) {
            if (lane_id == src_lane) {
                const uint32_t* empty_entry_ptr =
                        SlabEntryPtr(src_bucket, lane_empty, slab_ptr);
                uint32_t old_empty_entry_value =
                        atomicCAS((unsigned int*)empty_entry_ptr,
                                  kEmptyNodeAddr, buf_index);
                // Success; on failure another warp raced us and we restart.
                if (old_empty_entry_value == kEmptyNodeAddr) {
                    lane_active = false;
                    mask = true;
                }
            }
        }
        // Branch 3: slab full and key absent; move to the next slab.
        else {
            uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                                 kNextSlabPtrLaneId, kWarpSize);
            // Branch 3.1: the next slab exists; restart from it.
            if (next_slab_ptr != kEmptySlabAddr) {
                slab_ptr = next_slab_ptr;
            }
            // Branch 3.2: allocate a new slab and try to link it. All lanes
            // participate: slab allocation is warp-cooperative.
            else {
                uint32_t new_next_slab_ptr = AllocateSlab(lane_id);
                if (lane_id == kNextSlabPtrLaneId) {
                    const uint32_t* next_slab_entry_ptr = SlabEntryPtr(
                            src_bucket, kNextSlabPtrLaneId, slab_ptr);
                    uint32_t old_next_slab_entry_value =
                            atomicCAS((unsigned int*)next_slab_entry_ptr,
                                      kEmptySlabAddr, new_next_slab_ptr);
                    // Another warp linked a slab first; free ours.
                    if (old_next_slab_entry_value != kEmptySlabAddr) {
                        FreeSlab(new_next_slab_ptr);
                    }
                }
            }
        }
        prev_work_queue = work_queue;
    }

    return mask;
}
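The loop above serializes divergent per-lane requests: __ballot_sync builds a
mask of lanes that still have work, __ffs picks the lowest active lane, and
the whole warp then services that lane's key together. A stripped-down sketch
of the idiom, illustrative only and assuming a full 0xFFFFFFFF sync mask:

// Minimal sketch of the warp work-queue idiom; not part of this header.
__device__ void WarpWorkQueueSketch(bool lane_active, uint32_t lane_id,
                                    int my_task) {
    uint32_t work_queue;
    // All 32 lanes must evaluate the ballot, including retired ones.
    while ((work_queue = __ballot_sync(0xFFFFFFFF, lane_active))) {
        uint32_t src_lane = __ffs(work_queue) - 1;
        // Broadcast the chosen lane's task to the whole warp.
        int task = __shfl_sync(0xFFFFFFFF, my_task, src_lane, 32);
        // ... all 32 lanes cooperate on `task` here ...
        (void)task;
        if (lane_id == src_lane) {
            lane_active = false;  // This lane's request is done.
        }
    }
}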
template <typename Key, typename Hash, typename Eq>
__device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Find(
        bool lane_active, uint32_t lane_id, uint32_t bucket_id,
        const Key& query_key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = work_queue;
    uint32_t slab_ptr = kHeadSlabAddr;
    buf_index_t buf_index = 0;
    bool mask = false;

    // Loop while any lane in the warp still has a pending query.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the list head if the last query finished.
        slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        Key src_key;
        WarpSyncKey(query_key, src_lane, src_key);

        // Each lane loads its entry of the current slab.
        uint32_t slab_entry = *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
        int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);

        // Branch 1: the key is found; broadcast its buffer index.
        if (lane_found >= 0) {
            uint32_t found_buf_index = __shfl_sync(kSyncLanesMask, slab_entry,
                                                   lane_found, kWarpSize);
            if (lane_id == src_lane) {
                lane_active = false;
                buf_index = found_buf_index;
                mask = true;
            }
        }
        // Branch 2: not found in this slab; follow the chain or abort.
        else {
            uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                                 kNextSlabPtrLaneId, kWarpSize);
            if (next_slab_ptr == kEmptySlabAddr) {
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                slab_ptr = next_slab_ptr;
            }
        }
        prev_work_queue = work_queue;
    }

    return make_pair(buf_index, mask);
}
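A found buf_index indexes into the key/value buffer, so a caller typically
dereferences it through the buffer accessor. A hedged sketch of a lookup
kernel built on Find; the kernel name and the Value template parameter are
hypothetical, assuming values are stored at value slot 0:

// Hypothetical example; not part of this header.
template <typename Key, typename Hash, typename Eq, typename Value>
__global__ void LookupValuesSketch(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                   const Key* keys, Value* out,
                                   int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;
    if (tid - lane_id >= count) return;  // Whole warp exits together.

    bool lane_active = false;
    uint32_t bucket_id = 0;
    Key key;
    if (tid < count) {
        lane_active = true;
        key = keys[tid];
        bucket_id = impl.ComputeBucket(key);
    }
    auto result = impl.Find(lane_active, lane_id, bucket_id, key);
    if (tid < count && result.second) {
        out[tid] = *static_cast<Value*>(
                impl.buffer_accessor_.GetValuePtr(result.first));
    }
}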
template <typename Key, typename Hash, typename Eq>
__device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Erase(
        bool lane_active, uint32_t lane_id, uint32_t bucket_id,
        const Key& key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t slab_ptr = kHeadSlabAddr;
    Key src_key;
    buf_index_t buf_index = 0;
    bool mask = false;

    // Loop while any lane in the warp still has a pending erasure.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // Restart from the list head if the last erasure finished.
        slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key, src_lane, src_key);

        uint32_t slab_entry = *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
        int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);

        // Branch 1: the key is found; atomically clear its entry.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                const uint32_t* found_entry_ptr =
                        SlabEntryPtr(src_bucket, lane_found, slab_ptr);
                uint32_t old_found_entry_value = atomicExch(
                        (unsigned int*)found_entry_ptr, kEmptyNodeAddr);
                lane_active = false;
                // Another warp may have erased the entry first.
                mask = (old_found_entry_value != kEmptyNodeAddr);
                buf_index = old_found_entry_value;
            }
        }
        // Branch 2: not found in this slab; follow the chain or abort.
        else {
            uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                                 kNextSlabPtrLaneId, kWarpSize);
            if (next_slab_ptr == kEmptySlabAddr) {
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                slab_ptr = next_slab_ptr;
            }
        }
        prev_work_queue = work_queue;
    }

    return make_pair(buf_index, mask);
}
template <typename Key, typename Hash, typename Eq>
__device__ void SlabHashBackendImpl<Key, Hash, Eq>::WarpSyncKey(
        const Key& key, uint32_t lane_id, Key& ret_key) {
    auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
    auto src_key_ptr = reinterpret_cast<const int*>(&key);
    // Broadcast the key from lane_id to all lanes, one int at a time.
    for (int i = 0; i < key_size_in_int_; ++i) {
        dst_key_ptr[i] =
                __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
    }
}
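Because the key is broadcast int by int, sizeof(Key) should be a multiple of
sizeof(int) for key_size_in_int_ to cover the full key. For example, a
hypothetical 12-byte coordinate key takes three shuffle rounds:

// Hypothetical key type; 12 bytes = 3 ints, so WarpSyncKey runs 3 shuffles.
struct Coord3i {
    int x, y, z;
};
static_assert(sizeof(Coord3i) % sizeof(int) == 0,
              "Keys must be int-sized multiples for WarpSyncKey.");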
template <typename Key, typename Hash, typename Eq>
__device__ int32_t SlabHashBackendImpl<Key, Hash, Eq>::WarpFindKey(
        const Key& src_key, uint32_t lane_id, uint32_t slab_entry) {
    bool is_lane_found =
            // Select only the lanes that store key entries.
            ((1 << lane_id) & kNodePtrLanesMask)
            // Validate the entry; here slab_entry is a buf_index.
            && (slab_entry != kEmptyNodeAddr)
            // Compare the stored key in the buffer against the query.
            && eq_fn_(*static_cast<Key*>(
                              buffer_accessor_.GetKeyPtr(slab_entry)),
                      src_key);

    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
}
template <typename Key, typename Hash, typename Eq>
__device__ int32_t SlabHashBackendImpl<Key, Hash, Eq>::WarpFindEmpty(
        uint32_t slab_entry) {
    bool is_lane_empty = (slab_entry == kEmptyNodeAddr);
    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
}
template <typename Key, typename Hash, typename Eq>
__device__ int64_t SlabHashBackendImpl<Key, Hash, Eq>::ComputeBucket(
        const Key& key) const {
    return hash_fn_(key) % bucket_count_;
}
template <typename Key, typename Hash, typename Eq>
__device__ uint32_t SlabHashBackendImpl<Key, Hash, Eq>::AllocateSlab(
        uint32_t lane_id) {
    // Warp-cooperative: every lane of the warp must call this.
    return node_mgr_impl_.WarpAllocate(lane_id);
}

template <typename Key, typename Hash, typename Eq>
__device__ void SlabHashBackendImpl<Key, Hash, Eq>::FreeSlab(
        uint32_t slab_ptr) {
    node_mgr_impl_.FreeUntouched(slab_ptr);
}
template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  int heap_counter_prev,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        // Write all keys to the buffer first to avoid thread conflicts.
        buf_index_t buf_index =
                impl.buffer_accessor_.heap_[heap_counter_prev + tid];
        void* key = impl.buffer_accessor_.GetKeyPtr(buf_index);
        *static_cast<Key*>(key) = input_keys_templated[tid];
        output_buf_indices[tid] = buf_index;
    }
}
template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = tid & 0x1F;

    // A whole warp must exit together, or __ballot_sync would deadlock.
    if (tid - lane_id >= count) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;
    buf_index_t buf_index = 0;
    Key key;

    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        buf_index = output_buf_indices[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    // Out-of-bound threads still run for warp synchronization.
    bool mask = impl.Insert(lane_active, lane_id, bucket_id, key, buf_index);

    if (tid < count) {
        output_masks[tid] = mask;
    }
}
template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* const* input_values_soa,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count,
                                  int64_t n_values) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        buf_index_t buf_index = output_buf_indices[tid];

        if (output_masks[tid]) {
            // Copy each value array of the SoA input, block by block.
            for (int j = 0; j < n_values; ++j) {
                int64_t blocks_per_element =
                        impl.buffer_accessor_.value_blocks_per_element_[j];
                block_t* dst_value = static_cast<block_t*>(
                        impl.buffer_accessor_.GetValuePtr(buf_index, j));
                const block_t* src_value =
                        static_cast<const block_t*>(input_values_soa[j]) +
                        blocks_per_element * tid;
                for (int b = 0; b < blocks_per_element; ++b) {
                    dst_value[b] = src_value[b];
                }
            }
        } else {
            // Insertion failed: return the pre-allocated buffer slot.
            impl.buffer_accessor_.DeviceFree(buf_index);
        }
    }
}
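Insertion is thus split into three kernels: Pass0 reserves buffer slots from
the heap and writes the keys, Pass1 performs the warp-cooperative slab
insertion, and Pass2 copies values (or frees the slots of failed insertions).
A hedged host-side sketch of the sequence; kBlockSize, the launch math, and
the caller-managed heap_counter_prev are assumptions, and the surrounding
backend class normally wraps this orchestration:

// Hedged sketch; names and launch parameters are assumptions.
template <typename Key, typename Hash, typename Eq, typename block_t>
void LaunchInsertSketch(SlabHashBackendImpl<Key, Hash, Eq>& impl,
                        const void* keys, const void* const* values_soa,
                        buf_index_t* buf_indices, bool* masks,
                        int64_t count, int64_t n_values,
                        int heap_counter_prev) {
    const uint32_t kBlockSize = 128;
    uint32_t blocks =
            static_cast<uint32_t>((count + kBlockSize - 1) / kBlockSize);

    InsertKernelPass0<<<blocks, kBlockSize>>>(impl, keys, buf_indices,
                                              heap_counter_prev, count);
    InsertKernelPass1<<<blocks, kBlockSize>>>(impl, keys, buf_indices, masks,
                                              count);
    InsertKernelPass2<Key, Hash, Eq, block_t>
            <<<blocks, kBlockSize>>>(impl, values_soa, buf_indices, masks,
                                     count, n_values);
}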
template <typename Key, typename Hash, typename Eq>
__global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                           const void* input_keys,
                           buf_index_t* output_buf_indices,
                           bool* output_masks,
                           int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // A whole warp must exit together, or __ballot_sync would deadlock.
    if ((tid - lane_id) >= count) {
        return;
    }

    bool lane_active = false;
    uint32_t bucket_id = 0;
    Key key;
    Pair<buf_index_t, bool> result;

    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    result = impl.Find(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_buf_indices[tid] = result.first;
        output_masks[tid] = result.second;
    }
}
template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 const void* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    if (tid - lane_id >= count) {
        return;
    }

    bool lane_active = false;
    uint32_t bucket_id = 0;
    Key key;

    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    auto result = impl.Erase(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_buf_indices[tid] = result.first;
        output_masks[tid] = result.second;
    }
}
template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < count && output_masks[tid]) {
        impl.buffer_accessor_.DeviceFree(output_buf_indices[tid]);
    }
}
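Erase mirrors the insert pipeline: Pass0 unlinks entries from the slabs and
records which erasures succeeded, and Pass1 returns the freed buffer slots.
Keeping DeviceFree out of the warp-divergent slab traversal is the point of
the split. A hedged launch sketch, with kBlockSize and the launch math as
assumptions:

// Hedged sketch; names and launch parameters are assumptions.
template <typename Key, typename Hash, typename Eq>
void LaunchEraseSketch(SlabHashBackendImpl<Key, Hash, Eq>& impl,
                       const void* keys, buf_index_t* buf_indices,
                       bool* masks, int64_t count) {
    const uint32_t kBlockSize = 128;
    uint32_t blocks =
            static_cast<uint32_t>((count + kBlockSize - 1) / kBlockSize);

    EraseKernelPass0<<<blocks, kBlockSize>>>(impl, keys, buf_indices, masks,
                                             count);
    EraseKernelPass1<<<blocks, kBlockSize>>>(impl, buf_indices, masks, count);
}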
template <typename Key, typename Hash, typename Eq>
__global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                       buf_index_t* output_buf_indices,
                                       uint32_t* output_count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // One warp scans one bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    // Scan the head slab.
    uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
    bool is_active = slab_entry != kEmptyNodeAddr;

    if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
        uint32_t index = atomicAdd(output_count, 1);
        output_buf_indices[index] = slab_entry;
    }

    uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                    kNextSlabPtrLaneId, kWarpSize);

    // Scan the chained slabs.
    while (slab_ptr != kEmptySlabAddr) {
        slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
        is_active = (slab_entry != kEmptyNodeAddr);

        if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
            uint32_t index = atomicAdd(output_count, 1);
            output_buf_indices[index] = slab_entry;
        }
        slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
                               kWarpSize);
    }
}
template <typename Key, typename Hash, typename Eq>
__global__ void CountElemsPerBucketKernel(
        SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // One warp counts one bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    uint32_t count = 0;

    // Count the head slab, then follow the chain.
    uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
    count += __popc(
            __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
    uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                    kNextSlabPtrLaneId, kWarpSize);

    while (slab_ptr != kEmptySlabAddr) {
        slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
        count += __popc(__ballot_sync(kNodePtrLanesMask,
                                      slab_entry != kEmptyNodeAddr));
        slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
                               kWarpSize);
    }

    // One lane writes back the per-bucket total.
    if (lane_id == 0) {
        bucket_elem_counts[bucket_id] = count;
    }
}
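Since each bucket gets a full warp, the launch needs bucket_count_ * kWarpSize
threads. The per-bucket counts are handy for diagnosing load factor; a hedged
host-side sketch, assuming the counts were already copied back to the host
(the helper name is hypothetical):

// Hedged sketch; helper name is an assumption, not part of this header.
#include <numeric>
#include <vector>

inline double AverageBucketLoadSketch(const std::vector<int64_t>& counts) {
    if (counts.empty()) return 0.0;
    int64_t total = std::accumulate(counts.begin(), counts.end(), int64_t(0));
    return static_cast<double>(total) / counts.size();
}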