Open3D (C++ API)  0.18.0
SlabHashBackendImpl.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// Copyright (c) 2018-2023 www.open3d.org
// SPDX-License-Identifier: MIT
// ----------------------------------------------------------------------------

// Copyright 2019 Saman Ashkiani
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing permissions
// and limitations under the License.

#pragma once

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
#include "open3d/core/hashmap/CUDA/SlabMacros.h"
#include "open3d/core/hashmap/CUDA/SlabNodeManager.h"
#include "open3d/core/hashmap/CUDA/SlabTraits.h"
#include "open3d/core/hashmap/HashBackendBuffer.h"

namespace open3d {
namespace core {

// Each slab contains a collection of uint32_t entries.
// Each uint32_t entry can represent:
// 0) an empty placeholder;
// 1) a stored buf_index;
// 2) a pointer to the next slab if at the end of the slab.
// In cases 0) and 1), it is interpreted as a buf_index_t.
// In case 2), it is interpreted as a uint32_t.
// The two types are equivalent, but we differentiate them in the
// implementation to emphasize the difference in interpretation.
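//
// Illustrative layout of one slab, read warp-cooperatively (one entry per
// lane); the exact constants are defined in the slab macros header:
//
//   lane:   0      1     ...    30     31 (kNextSlabPtrLaneId)
//   entry: [key0] [key1] ...  [key30] [next slab ptr / kEmptySlabAddr]
//
// Lanes selected by kNodePtrLanesMask hold buf_index entries (or
// kEmptyNodeAddr placeholders); the last lane holds the pointer to the next
// slab in the bucket's linked list.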

template <typename Key, typename Hash, typename Eq>
class SlabHashBackendImpl {
public:
    SlabHashBackendImpl();

    __host__ void Setup(int64_t init_buckets,
                        const SlabNodeManagerImpl& node_mgr_impl,
                        const CUDAHashBackendBufferAccessor& buffer_accessor);

    /// Warp-insert a pre-allocated buf_index at key.
    __device__ bool Insert(bool lane_active,
                           uint32_t lane_id,
                           uint32_t bucket_id,
                           const Key& key,
                           buf_index_t buf_index);

    /// Warp-find a buf_index and its mask at key.
    __device__ Pair<buf_index_t, bool> Find(bool lane_active,
                                            uint32_t lane_id,
                                            uint32_t bucket_id,
                                            const Key& key);

    /// Warp-erase an entry at key.
    __device__ Pair<buf_index_t, bool> Erase(bool lane_active,
                                             uint32_t lane_id,
                                             uint32_t bucket_id,
                                             const Key& key);

    /// Warp-synchronize a key in a slab.
    __device__ void WarpSyncKey(const Key& key, uint32_t lane_id, Key& ret_key);

    /// Warp-find a key in a slab.
    __device__ int32_t WarpFindKey(const Key& src_key,
                                   uint32_t lane_id,
                                   uint32_t slab_entry);

    /// Warp-find the first empty slot in a slab.
    __device__ int32_t WarpFindEmpty(uint32_t slab_entry);

    // Hash function.
    __device__ int64_t ComputeBucket(const Key& key) const;

    // Node manager.
    __device__ uint32_t AllocateSlab(uint32_t lane_id);
    __device__ void FreeSlab(uint32_t slab_ptr);

    // Helpers.
    __device__ uint32_t* SlabEntryPtr(uint32_t bucket_id,
                                      uint32_t lane_id,
                                      uint32_t slab_ptr) {
        return (slab_ptr == kHeadSlabAddr)
                       ? SlabEntryPtrFromHead(bucket_id, lane_id)
                       : SlabEntryPtrFromNodes(slab_ptr, lane_id);
    }

    __device__ uint32_t* SlabEntryPtrFromNodes(uint32_t slab_ptr,
                                               uint32_t lane_id) {
        return node_mgr_impl_.get_unit_ptr_from_slab(slab_ptr, lane_id);
    }
    __device__ uint32_t* SlabEntryPtrFromHead(uint32_t bucket_id,
                                              uint32_t lane_id) {
        return reinterpret_cast<uint32_t*>(bucket_list_head_) +
               bucket_id * kWarpSize + lane_id;
    }

public:
    Hash hash_fn_;
    Eq eq_fn_;
    int64_t bucket_count_;

    Slab* bucket_list_head_;
    SlabNodeManagerImpl node_mgr_impl_;
    CUDAHashBackendBufferAccessor buffer_accessor_;

    // TODO: verify size with alignment
    int key_size_in_int_ = sizeof(Key) / sizeof(int);
};

/// Kernels.
template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  int heap_counter_prev,
                                  int64_t count);

template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count);

template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* const* input_values_soa,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count,
                                  int64_t n_values);

template <typename Key, typename Hash, typename Eq>
__global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                           const void* input_keys,
                           buf_index_t* output_buf_indices,
                           bool* output_masks,
                           int64_t count);

template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 const void* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count);

template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count);

template <typename Key, typename Hash, typename Eq>
__global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                       buf_index_t* output_buf_indices,
                                       uint32_t* output_count);

template <typename Key, typename Hash, typename Eq>
__global__ void CountElemsPerBucketKernel(
        SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts);
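
// A minimal host-side sketch of how these kernels are typically sequenced for
// a batch insertion (illustrative only; the launch shape and the way
// heap_counter_prev is obtained are assumptions, not part of this header):
//
//   const uint32_t threads = 128;
//   const uint32_t blocks = (count + threads - 1) / threads;
//   // Pass 0: claim buffer slots from the heap and write all keys.
//   InsertKernelPass0<<<blocks, threads>>>(impl, keys, buf_indices,
//                                          heap_counter_prev, count);
//   // Pass 1: warp-cooperatively link the claimed buf_indices into the slabs.
//   InsertKernelPass1<<<blocks, threads>>>(impl, keys, buf_indices, masks,
//                                          count);
//   // Pass 2: copy values for successful insertions; free slots of duplicates.
//   InsertKernelPass2<Key, Hash, Eq, block_t>
//           <<<blocks, threads>>>(impl, values_soa, buf_indices, masks, count,
//                                 n_values);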

template <typename Key, typename Hash, typename Eq>
SlabHashBackendImpl<Key, Hash, Eq>::SlabHashBackendImpl()
    : bucket_count_(0), bucket_list_head_(nullptr) {}

template <typename Key, typename Hash, typename Eq>
__host__ void SlabHashBackendImpl<Key, Hash, Eq>::Setup(
        int64_t init_buckets,
        const SlabNodeManagerImpl& allocator_impl,
        const CUDAHashBackendBufferAccessor& buffer_accessor) {
    bucket_count_ = init_buckets;
    node_mgr_impl_ = allocator_impl;
    buffer_accessor_ = buffer_accessor;
}

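// Insert, Find, and Erase below share one warp-cooperative pattern: every lane
// votes on whether it still has work (__ballot_sync builds the work queue),
// the lowest active lane (__ffs) becomes src_lane, its key and bucket are
// broadcast to the whole warp, and the 32 lanes then read one slab entry each
// so a full slab is probed in a single step. Only src_lane records its own
// result; the loop restarts from the bucket head whenever the work queue
// changes.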
template <typename Key, typename Hash, typename Eq>
__device__ bool SlabHashBackendImpl<Key, Hash, Eq>::Insert(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& key,
        buf_index_t buf_index) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t slab_ptr = kHeadSlabAddr;
    Key src_key;

    bool mask = false;

    // > Loop when we have active lanes.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // 0. Restart from the linked list head if the last insertion is
        // finished.
        slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
        WarpSyncKey(key, src_lane, src_key);

        uint32_t slab_entry = *SlabEntryPtr(src_bucket, lane_id, slab_ptr);

        int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
        int32_t lane_empty = WarpFindEmpty(slab_entry);

        // Branch 1: key already exists, ABORT.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                lane_active = false;
            }
        }

        // Branch 2: empty slot available, try to insert.
        else if (lane_empty >= 0) {
            // Cannot merge the if statements, otherwise the warp flow will be
            // interrupted.
            if (lane_id == src_lane) {
                // Now regard the entry as a value of buf_index.
                const uint32_t* empty_entry_ptr =
                        SlabEntryPtr(src_bucket, lane_empty, slab_ptr);

                uint32_t old_empty_entry_value =
                        atomicCAS((unsigned int*)empty_entry_ptr,
                                  kEmptyNodeAddr, buf_index);

                // Branch 2.1: SUCCEED.
                if (old_empty_entry_value == kEmptyNodeAddr) {
                    lane_active = false;
                    mask = true;
                }
                // Branch 2.2: failed, RESTART.
                // In the subsequent attempt:
                // > if the same key was inserted into this slot,
                //   we fall back to Branch 1;
                // > if a different key was inserted,
                //   we go to Branch 2 or 3.
            }
        }

        // Branch 3: nothing found in this slab, go to the next slab.
        else {
            // Broadcast the next slab pointer.
            uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                                 kNextSlabPtrLaneId, kWarpSize);

            // Branch 3.1: next slab exists, RESTART at the updated slab ptr.
            if (next_slab_ptr != kEmptySlabAddr) {
                slab_ptr = next_slab_ptr;
            }

            // Branch 3.2: next slab is empty, try to allocate one from the
            // slab buffer.
            else {
                // Warp-allocate; must be outside the condition clause.
                uint32_t new_next_slab_ptr = AllocateSlab(lane_id);

                if (lane_id == kNextSlabPtrLaneId) {
                    const uint32_t* next_slab_entry_ptr = SlabEntryPtr(
                            src_bucket, kNextSlabPtrLaneId, slab_ptr);

                    uint32_t old_next_slab_entry_value =
                            atomicCAS((unsigned int*)next_slab_entry_ptr,
                                      kEmptySlabAddr, new_next_slab_ptr);

                    // Branch 3.2.1: another thread has allocated, RESTART.
                    // In the subsequent attempt, go to Branch 2.
                    if (old_next_slab_entry_value != kEmptySlabAddr) {
                        FreeSlab(new_next_slab_ptr);
                    }

                    // Branch 3.2.2: this thread allocated successfully.
                    // RESTART, go to Branch 2.
                }
            }
        }

        prev_work_queue = work_queue;
    }

    return mask;
}

template <typename Key, typename Hash, typename Eq>
__device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Find(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& query_key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = work_queue;
    uint32_t slab_ptr = kHeadSlabAddr;

    buf_index_t buf_index = kNullAddr;
    bool mask = false;

    // > Loop when we have active lanes.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // 0. Restart from the linked list head if the last query is finished.
        slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        Key src_key;
        WarpSyncKey(query_key, src_lane, src_key);

        // Each lane in the warp reads a unit in the slab in parallel.
        const uint32_t slab_entry =
                *SlabEntryPtr(src_bucket, lane_id, slab_ptr);

        int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);

        // 1. Found in this slab, SUCCEED.
        if (lane_found >= 0) {
            // Broadcast the found value.
            uint32_t found_buf_index = __shfl_sync(kSyncLanesMask, slab_entry,
                                                   lane_found, kWarpSize);

            if (lane_id == src_lane) {
                lane_active = false;
                buf_index = found_buf_index;
                mask = true;
            }
        }

        // 2. Not found in this slab.
        else {
            // Broadcast the next slab pointer: lane 31 reads 'next'.
            uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                                 kNextSlabPtrLaneId, kWarpSize);

            // 2.1. Next slab is empty, ABORT.
            if (next_slab_ptr == kEmptySlabAddr) {
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            }
            // 2.2. Next slab exists, RESTART.
            else {
                slab_ptr = next_slab_ptr;
            }
        }

        prev_work_queue = work_queue;
    }

    return make_pair(buf_index, mask);
}

template <typename Key, typename Hash, typename Eq>
__device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Erase(
        bool lane_active,
        uint32_t lane_id,
        uint32_t bucket_id,
        const Key& key) {
    uint32_t work_queue = 0;
    uint32_t prev_work_queue = 0;
    uint32_t slab_ptr = kHeadSlabAddr;
    Key src_key;

    buf_index_t buf_index = 0;
    bool mask = false;

    // > Loop when we have active lanes.
    while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
        // 0. Restart from the linked list head if the last erasure is
        // finished.
        slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
        uint32_t src_lane = __ffs(work_queue) - 1;
        uint32_t src_bucket =
                __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);

        WarpSyncKey(key, src_lane, src_key);

        const uint32_t slab_entry =
                *SlabEntryPtr(src_bucket, lane_id, slab_ptr);

        int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);

        // Branch 1: key found.
        if (lane_found >= 0) {
            if (lane_id == src_lane) {
                uint32_t* found_entry_ptr =
                        SlabEntryPtr(src_bucket, lane_found, slab_ptr);

                uint32_t old_found_entry_value = atomicExch(
                        (unsigned int*)found_entry_ptr, kEmptyNodeAddr);

                // Branch 1.2: another thread might have done the job already;
                // avoid a double free.
                mask = (old_found_entry_value != kEmptyNodeAddr);
                buf_index = old_found_entry_value;
            }
        } else {  // Branch 2: no matching slot found in this slab.
            uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                                 kNextSlabPtrLaneId, kWarpSize);
            if (next_slab_ptr == kEmptySlabAddr) {
                // Not found, ABORT.
                if (lane_id == src_lane) {
                    lane_active = false;
                }
            } else {
                slab_ptr = next_slab_ptr;
            }
        }
        prev_work_queue = work_queue;
    }

    return make_pair(buf_index, mask);
}

template <typename Key, typename Hash, typename Eq>
__device__ void SlabHashBackendImpl<Key, Hash, Eq>::WarpSyncKey(
        const Key& key, uint32_t lane_id, Key& ret_key) {
    auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
    auto src_key_ptr = reinterpret_cast<const int*>(&key);
    for (int i = 0; i < key_size_in_int_; ++i) {
        dst_key_ptr[i] =
                __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
    }
}

template <typename Key, typename Hash, typename Eq>
__device__ int32_t SlabHashBackendImpl<Key, Hash, Eq>::WarpFindKey(
        const Key& key, uint32_t lane_id, uint32_t slab_entry) {
    bool is_lane_found =
            // Select key lanes.
            ((1 << lane_id) & kNodePtrLanesMask)
            // Validate key buf_indices.
            && (slab_entry != kEmptyNodeAddr)
            // Find keys in buffer. Now slab_entry is interpreted as buf_index.
            &&
            eq_fn_(*static_cast<Key*>(buffer_accessor_.GetKeyPtr(slab_entry)),
                   key);

    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
}
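
// Worked example of the return value above: if only lane 3 evaluates
// is_lane_found to true, __ballot_sync yields 0x8, __ffs returns 4, and
// WarpFindKey returns 3 (the lowest matching lane). If no lane matches, the
// ballot is 0, __ffs returns 0, and the result is -1, i.e. "not found".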

template <typename Key, typename Hash, typename Eq>
__device__ int32_t
SlabHashBackendImpl<Key, Hash, Eq>::WarpFindEmpty(uint32_t slab_entry) {
    bool is_lane_empty = (slab_entry == kEmptyNodeAddr);
    return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
}

template <typename Key, typename Hash, typename Eq>
__device__ int64_t
SlabHashBackendImpl<Key, Hash, Eq>::ComputeBucket(const Key& key) const {
    return hash_fn_(key) % bucket_count_;
}

template <typename Key, typename Hash, typename Eq>
__device__ uint32_t
SlabHashBackendImpl<Key, Hash, Eq>::AllocateSlab(uint32_t lane_id) {
    return node_mgr_impl_.WarpAllocate(lane_id);
}

template <typename Key, typename Hash, typename Eq>
__device__ __forceinline__ void SlabHashBackendImpl<Key, Hash, Eq>::FreeSlab(
        uint32_t slab_ptr) {
    node_mgr_impl_.FreeUntouched(slab_ptr);
}

template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  int heap_counter_prev,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        // First write ALL input_keys to avoid potential thread conflicts.
        buf_index_t buf_index =
                impl.buffer_accessor_.heap_[heap_counter_prev + tid];
        void* key = impl.buffer_accessor_.GetKeyPtr(buf_index);
        *static_cast<Key*>(key) = input_keys_templated[tid];
        output_buf_indices[tid] = buf_index;
    }
}

template <typename Key, typename Hash, typename Eq>
__global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = tid & 0x1F;

    if (tid - lane_id >= count) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;
    buf_index_t buf_index = 0;

    // Dummy for warp sync.
    Key key;
    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        buf_index = output_buf_indices[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    // Index out-of-bound threads still have to run for warp synchronization.
    bool mask = impl.Insert(lane_active, lane_id, bucket_id, key, buf_index);

    if (tid < count) {
        output_masks[tid] = mask;
    }
}

template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                  const void* const* input_values_soa,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count,
                                  int64_t n_values) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < count) {
        buf_index_t buf_index = output_buf_indices[tid];

        if (output_masks[tid]) {
            for (int j = 0; j < n_values; ++j) {
                int64_t blocks_per_element =
                        impl.buffer_accessor_.value_blocks_per_element_[j];

                block_t* dst_value = static_cast<block_t*>(
                        impl.buffer_accessor_.GetValuePtr(buf_index, j));
                const block_t* src_value =
                        static_cast<const block_t*>(input_values_soa[j]) +
                        blocks_per_element * tid;
                for (int b = 0; b < blocks_per_element; ++b) {
                    dst_value[b] = src_value[b];
                }
            }
        } else {
            impl.buffer_accessor_.DeviceFree(buf_index);
        }
    }
}

template <typename Key, typename Hash, typename Eq>
__global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                           const void* input_keys,
                           buf_index_t* output_buf_indices,
                           bool* output_masks,
                           int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // This warp is idle.
    if ((tid - lane_id) >= count) {
        return;
    }

    // Initialize the memory allocator on each warp.
    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy for warp sync.
    Key key;
    Pair<buf_index_t, bool> result;

    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    result = impl.Find(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_buf_indices[tid] = result.first;
        output_masks[tid] = result.second;
    }
}

template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 const void* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    const Key* input_keys_templated = static_cast<const Key*>(input_keys);
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    if (tid - lane_id >= count) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    bool lane_active = false;
    uint32_t bucket_id = 0;

    // Dummy for warp sync.
    Key key;
    if (tid < count) {
        lane_active = true;
        key = input_keys_templated[tid];
        bucket_id = impl.ComputeBucket(key);
    }

    auto result = impl.Erase(lane_active, lane_id, bucket_id, key);

    if (tid < count) {
        output_buf_indices[tid] = result.first;
        output_masks[tid] = result.second;
    }
}

template <typename Key, typename Hash, typename Eq>
__global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < count && output_masks[tid]) {
        impl.buffer_accessor_.DeviceFree(output_buf_indices[tid]);
    }
}

template <typename Key, typename Hash, typename Eq>
__global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
                                       buf_index_t* output_buf_indices,
                                       uint32_t* output_count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign a warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
    bool is_active = slab_entry != kEmptyNodeAddr;

    if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
        uint32_t index = atomicAdd(output_count, 1);
        output_buf_indices[index] = slab_entry;
    }

    uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                    kNextSlabPtrLaneId, kWarpSize);

    // Collect the following nodes.
    while (slab_ptr != kEmptySlabAddr) {
        slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
        is_active = (slab_entry != kEmptyNodeAddr);

        if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
            uint32_t index = atomicAdd(output_count, 1);
            output_buf_indices[index] = slab_entry;
        }
        slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
                               kWarpSize);
    }
}

template <typename Key, typename Hash, typename Eq>
__global__ void CountElemsPerBucketKernel(
        SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane_id = threadIdx.x & 0x1F;

    // Assign a warp per bucket.
    uint32_t bucket_id = tid >> 5;
    if (bucket_id >= impl.bucket_count_) {
        return;
    }

    impl.node_mgr_impl_.Init(tid, lane_id);

    uint32_t count = 0;

    // Count the head node.
    uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
    count += __popc(
            __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
    uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
                                    kNextSlabPtrLaneId, kWarpSize);

    // Count the following nodes.
    while (slab_ptr != kEmptySlabAddr) {
        slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
        count += __popc(
                __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
        slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
                               kWarpSize);
    }

    // Write back the results.
    if (lane_id == 0) {
        bucket_elem_counts[bucket_id] = count;
    }
}

}  // namespace core
}  // namespace open3d