Open3D (C++ API)  0.12.0
HashmapCUDAImpl.h
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
32 
33 namespace open3d {
34 namespace core {
35 
36 template <typename Hash, typename KeyEq>
37 class CUDAHashmapImplContext {
38 public:
39  CUDAHashmapImplContext();
40 
41  __host__ void Setup(int64_t init_buckets,
42  int64_t init_capacity,
43  int64_t dsize_key,
44  int64_t dsize_value,
45  const InternalNodeManagerContext& node_mgr_ctx,
46  const CUDAHashmapBufferContext& kv_mgr_ctx);
47 
48  __device__ bool Insert(bool lane_active,
49  uint32_t lane_id,
50  uint32_t bucket_id,
51  const void* key_ptr,
52  addr_t iterator_addr);
53 
54  __device__ Pair<addr_t, bool> Find(bool lane_active,
55  uint32_t lane_id,
56  uint32_t bucket_id,
57  const void* key_ptr);
58 
59  __device__ Pair<addr_t, bool> Erase(bool lane_active,
60  uint32_t lane_id,
61  uint32_t bucket_id,
62  const void* key_ptr);
63 
64  __device__ void WarpSyncKey(const void* key_ptr,
65  uint32_t lane_id,
66  void* ret_key_ptr);
67  __device__ int32_t WarpFindKey(const void* src_key_ptr,
68  uint32_t lane_id,
69  addr_t ptr);
70  __device__ int32_t WarpFindEmpty(addr_t unit_data);
71 
72  // Hash function.
73  __device__ int64_t ComputeBucket(const void* key_ptr) const;
74 
75  // Node manager.
76  __device__ addr_t AllocateSlab(uint32_t lane_id);
77  __device__ void FreeSlab(addr_t slab_ptr);
78 
79  // Helpers.
80  __device__ addr_t* get_unit_ptr_from_list_nodes(addr_t slab_ptr,
81  uint32_t lane_id) {
82  return node_mgr_ctx_.get_unit_ptr_from_slab(slab_ptr, lane_id);
83  }
84  __device__ addr_t* get_unit_ptr_from_list_head(uint32_t bucket_id,
85  uint32_t lane_id) {
86  return reinterpret_cast<uint32_t*>(bucket_list_head_) +
87  bucket_id * kWarpSize + lane_id;
88  }
89 
90 public:
91  Hash hash_fn_;
92  KeyEq cmp_fn_;
93 
94  int64_t bucket_count_;
95  int64_t capacity_;
96  int64_t dsize_key_;
97  int64_t dsize_value_;
98 
99  Slab* bucket_list_head_;
100  InternalNodeManagerContext node_mgr_ctx_;
101  CUDAHashmapBufferContext kv_mgr_ctx_;
102 };
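// Layout note (inferred from the accessors above): each bucket owns a linked
// list of slabs. A slab spans kWarpSize (32) consecutive 32-bit units that one
// warp reads in parallel; the lanes selected by kNodePtrLanesMask hold addr_t
// handles into the key/value buffer, while lane kNextSlabPtrLaneId holds the
// address of the next slab (kEmptySlabAddr if there is none). Bucket head
// slabs are stored contiguously starting at bucket_list_head_, hence the
// bucket_id * kWarpSize + lane_id indexing in get_unit_ptr_from_list_head().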
103 
104 /// Kernels.
105 template <typename Hash, typename KeyEq>
106 __global__ void InsertKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
107  const void* input_keys,
108  addr_t* output_addrs,
109  int heap_counter_prev,
110  int64_t count);
111 
112 template <typename Hash, typename KeyEq>
113 __global__ void InsertKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
114  const void* input_keys,
115  addr_t* output_addrs,
116  bool* output_masks,
117  int64_t count);
118 
119 template <typename Hash, typename KeyEq>
120 __global__ void InsertKernelPass2(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
121  const void* input_values,
122  addr_t* output_addrs,
123  bool* output_masks,
124  int64_t count);
125 
126 template <typename Hash, typename KeyEq>
127 __global__ void FindKernel(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
128  const void* input_keys,
129  addr_t* output_addrs,
130  bool* output_masks,
131  int64_t count);
132 
133 template <typename Hash, typename KeyEq>
134 __global__ void EraseKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
135  const void* input_keys,
136  addr_t* output_addrs,
137  bool* output_masks,
138  int64_t count);
139 
140 template <typename Hash, typename KeyEq>
141 __global__ void EraseKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
142  addr_t* output_addrs,
143  bool* output_masks,
144  int64_t count);
145 
146 template <typename Hash, typename KeyEq>
147 __global__ void GetActiveIndicesKernel(
148  CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
149  addr_t* output_addrs,
150  uint32_t* output_iterator_count);
151 
152 template <typename Hash, typename KeyEq>
153 __global__ void CountElemsPerBucketKernel(
154  CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
155  int64_t* bucket_elem_counts);
156 
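// Illustrative host-side launch sequence for a batched insert. This is a
// sketch only: the actual launcher lives outside this header, and the block
// size of 128 as well as the variable names (ctx, keys, vals, addrs, masks,
// heap_prev) are assumptions.
//
//   const uint32_t threads = 128;
//   const uint32_t blocks = (count + threads - 1) / threads;
//   InsertKernelPass0<<<blocks, threads>>>(ctx, keys, addrs, heap_prev, count);
//   InsertKernelPass1<<<blocks, threads>>>(ctx, keys, addrs, masks, count);
//   InsertKernelPass2<<<blocks, threads>>>(ctx, vals, addrs, masks, count);
//
// Pass 0 copies every key into a pre-reserved buffer slot, Pass 1 links the
// slots into the slab lists (masks[i] becomes false for duplicate keys), and
// Pass 2 copies values for successful insertions and frees the slots of the
// failed ones.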
157 template <typename Hash, typename KeyEq>
158 CUDAHashmapImplContext<Hash, KeyEq>::CUDAHashmapImplContext()
159  : bucket_count_(0), bucket_list_head_(nullptr) {}
160 
161 template <typename Hash, typename KeyEq>
162 __host__ void CUDAHashmapImplContext<Hash, KeyEq>::Setup(
163  int64_t init_buckets,
164  int64_t init_capacity,
165  int64_t dsize_key,
166  int64_t dsize_value,
167  const InternalNodeManagerContext& allocator_ctx,
168  const CUDAHashmapBufferContext& pair_allocator_ctx) {
169  bucket_count_ = init_buckets;
170  capacity_ = init_capacity;
171  dsize_key_ = dsize_key;
172  dsize_value_ = dsize_value;
173 
174  node_mgr_ctx_ = allocator_ctx;
175  kv_mgr_ctx_ = pair_allocator_ctx;
176 
177  hash_fn_.key_size_in_int_ = dsize_key / sizeof(int);
178  cmp_fn_.key_size_in_int_ = dsize_key / sizeof(int);
179 }
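// Note: key_size_in_int_ implies that keys are handled as whole 32-bit words
// (see WarpSyncKey() and the MEMCPY_AS_INTS uses below), so dsize_key is
// expected to be a multiple of sizeof(int).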
180 
181 template <typename Hash, typename KeyEq>
182 __device__ bool CUDAHashmapImplContext<Hash, KeyEq>::Insert(
183  bool lane_active,
184  uint32_t lane_id,
185  uint32_t bucket_id,
186  const void* key,
187  addr_t iterator_addr) {
188  uint32_t work_queue = 0;
189  uint32_t prev_work_queue = 0;
190  uint32_t curr_slab_ptr = kHeadSlabAddr;
191  uint8_t src_key[kMaxKeyByteSize];
192 
193  bool mask = false;
194 
195  // > Loop when we have active lanes
196  while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
197  // 0. Restart from linked list head if last insertion is finished
198  curr_slab_ptr =
199  (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
200  uint32_t src_lane = __ffs(work_queue) - 1;
201  uint32_t src_bucket =
202  __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
203 
204  WarpSyncKey(key, src_lane, src_key);
205 
206  // Each lane in the warp reads a unit in the slab
207  uint32_t unit_data =
208  (curr_slab_ptr == kHeadSlabAddr)
209  ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
210  : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
211  lane_id));
212 
213  int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);
214  int32_t lane_empty = WarpFindEmpty(unit_data);
215 
216  // Branch 1: key already exists, ABORT
217  if (lane_found >= 0) {
218  if (lane_id == src_lane) {
219  // The pre-allocated heap slot is freed later, in InsertKernelPass2.
220  lane_active = false;
221  }
222  }
223 
224  // Branch 2: empty slot available, try to insert
225  else if (lane_empty >= 0) {
226  if (lane_id == src_lane) {
227  // TODO: check why we cannot put malloc here
228  const uint32_t* unit_data_ptr =
229  (curr_slab_ptr == kHeadSlabAddr)
230  ? get_unit_ptr_from_list_head(src_bucket,
231  lane_empty)
232  : get_unit_ptr_from_list_nodes(curr_slab_ptr,
233  lane_empty);
234 
235  addr_t old_iterator_addr =
236  atomicCAS((unsigned int*)unit_data_ptr, kEmptyNodeAddr,
237  iterator_addr);
238 
239  // Remember to clean up in another pass
240  // Branch 2.1: SUCCEED
241  if (old_iterator_addr == kEmptyNodeAddr) {
242  lane_active = false;
243  mask = true;
244  }
245  // Branch 2.2: failed: RESTART
246  // In the subsequent attempt,
247  // > if the same key was inserted in this slot,
248  // we fall back to Branch 1;
249  // > if a different key was inserted,
250  // we go to Branch 2 or 3.
251  }
252  }
253 
254  // Branch 3: nothing found in this slab, goto next slab
255  else {
256  // broadcast next slab
257  addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
258  kNextSlabPtrLaneId, kWarpSize);
259 
260  // Branch 3.1: next slab existing, RESTART this lane
261  if (next_slab_ptr != kEmptySlabAddr) {
262  curr_slab_ptr = next_slab_ptr;
263  }
264 
265  // Branch 3.2: next slab empty, try to allocate one
266  else {
267  addr_t new_next_slab_ptr = AllocateSlab(lane_id);
268 
269  if (lane_id == kNextSlabPtrLaneId) {
270  const uint32_t* unit_data_ptr =
271  (curr_slab_ptr == kHeadSlabAddr)
272  ? get_unit_ptr_from_list_head(
273  src_bucket, kNextSlabPtrLaneId)
274  : get_unit_ptr_from_list_nodes(
275  curr_slab_ptr,
276  kNextSlabPtrLaneId);
277 
278  addr_t old_next_slab_ptr =
279  atomicCAS((unsigned int*)unit_data_ptr,
280  kEmptySlabAddr, new_next_slab_ptr);
281 
282  // Branch 3.2.1: other thread allocated, RESTART lane. In
283  // the subsequent attempt, 'goto Branch 2'
284  if (old_next_slab_ptr != kEmptySlabAddr) {
285  FreeSlab(new_next_slab_ptr);
286  }
287  // Branch 3.2.2: this thread allocated, RESTART lane, 'goto
288  // Branch 2'
289  }
290  }
291  }
292 
293  prev_work_queue = work_queue;
294  }
295 
296  return mask;
297 }
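// Insert() is warp-cooperative: every lane must keep executing the
// __ballot_sync/__shfl_sync loop above until no lane remains active, which is
// why the calling kernel lets out-of-bound threads participate with
// lane_active == false instead of returning early.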
298 
299 template <typename Hash, typename KeyEq>
300 __device__ Pair<addr_t, bool> CUDAHashmapImplContext<Hash, KeyEq>::Find(
301  bool lane_active,
302  uint32_t lane_id,
303  uint32_t bucket_id,
304  const void* query_key) {
305  uint32_t work_queue = 0;
306  uint32_t prev_work_queue = work_queue;
307  uint32_t curr_slab_ptr = kHeadSlabAddr;
308 
309  addr_t iterator = kNullAddr;
310  bool mask = false;
311 
312  // > Loop when we have active lanes.
313  while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
314  // 0. Restart from linked list head if the last query is finished.
315  curr_slab_ptr =
316  (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
317  uint32_t src_lane = __ffs(work_queue) - 1;
318  uint32_t src_bucket =
319  __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
320 
321  uint8_t src_key[kMaxKeyByteSize];
322  WarpSyncKey(query_key, src_lane, src_key);
323 
324  // Each lane in the warp reads a unit in the slab in parallel.
325  const uint32_t unit_data =
326  (curr_slab_ptr == kHeadSlabAddr)
327  ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
328  : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
329  lane_id));
330 
331  int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);
332 
333  // 1. Found in this slab, SUCCEED.
334  if (lane_found >= 0) {
335  // broadcast found value
336  addr_t found_pair_internal_ptr = __shfl_sync(
337  kSyncLanesMask, unit_data, lane_found, kWarpSize);
338 
339  if (lane_id == src_lane) {
340  lane_active = false;
341 
342  // This slot stores the iterator_addr of the found pair.
343  iterator = found_pair_internal_ptr;
344  mask = true;
345  }
346  }
347 
348  // 2. Not found in this slab.
349  else {
350  // Broadcast next slab: lane 31 reads 'next'.
351  addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
352  kNextSlabPtrLaneId, kWarpSize);
353 
354  // 2.1. Next slab is empty, ABORT.
355  if (next_slab_ptr == kEmptySlabAddr) {
356  if (lane_id == src_lane) {
357  lane_active = false;
358  }
359  }
360  // 2.2. Next slab exists, RESTART.
361  else {
362  curr_slab_ptr = next_slab_ptr;
363  }
364  }
365 
366  prev_work_queue = work_queue;
367  }
368 
369  return make_pair(iterator, mask);
370 }
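// Find() returns the addr_t handle of the matching key/value pair in the
// buffer (kNullAddr if absent) together with a success flag. Queries are
// serialized through the work queue: one source lane is served per iteration
// while the whole warp probes that lane's slab in parallel.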
371 
372 template <typename Hash, typename KeyEq>
373 __device__ Pair<addr_t, bool> CUDAHashmapImplContext<Hash, KeyEq>::Erase(
374  bool lane_active,
375  uint32_t lane_id,
376  uint32_t bucket_id,
377  const void* key) {
378  uint32_t work_queue = 0;
379  uint32_t prev_work_queue = 0;
380  uint32_t curr_slab_ptr = kHeadSlabAddr;
381  uint8_t src_key[kMaxKeyByteSize];
382 
383  addr_t iterator_addr = 0;
384  bool mask = false;
385 
386  // > Loop when we have active lanes.
387  while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
388  // 0. Restart from linked list head if last insertion is finished.
389  curr_slab_ptr =
390  (prev_work_queue != work_queue) ? kHeadSlabAddr : curr_slab_ptr;
391  uint32_t src_lane = __ffs(work_queue) - 1;
392  uint32_t src_bucket =
393  __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
394 
395  WarpSyncKey(key, src_lane, src_key);
396 
397  const uint32_t unit_data =
398  (curr_slab_ptr == kHeadSlabAddr)
399  ? *(get_unit_ptr_from_list_head(src_bucket, lane_id))
400  : *(get_unit_ptr_from_list_nodes(curr_slab_ptr,
401  lane_id));
402 
403  int32_t lane_found = WarpFindKey(src_key, lane_id, unit_data);
404 
405  // Branch 1: key found.
406  if (lane_found >= 0) {
407  if (lane_id == src_lane) {
408  uint32_t* unit_data_ptr =
409  (curr_slab_ptr == kHeadSlabAddr)
410  ? get_unit_ptr_from_list_head(src_bucket,
411  lane_found)
412  : get_unit_ptr_from_list_nodes(curr_slab_ptr,
413  lane_found);
414 
415  uint32_t pair_to_delete = atomicExch(
416  (unsigned int*)unit_data_ptr, kEmptyNodeAddr);
417  mask = pair_to_delete != kEmptyNodeAddr;
418  iterator_addr = pair_to_delete;
419  // Branch 1.2: other thread did the job, avoid double free
420  }
421  } else { // no matching slot found:
422  addr_t next_slab_ptr = __shfl_sync(kSyncLanesMask, unit_data,
423  kNextSlabPtrLaneId, kWarpSize);
424  if (next_slab_ptr == kEmptySlabAddr) {
425  // not found:
426  if (lane_id == src_lane) {
427  lane_active = false;
428  }
429  } else {
430  curr_slab_ptr = next_slab_ptr;
431  }
432  }
433  prev_work_queue = work_queue;
434  }
435 
436  return make_pair(iterator_addr, mask);
437 }
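// Erase() only unlinks the pair by atomically swapping its slot back to
// kEmptyNodeAddr; the buffer entry itself is released later by
// EraseKernelPass1(). The atomicExch() result guards against freeing the same
// entry twice when several threads erase the same key.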
438 
439 template <typename Hash, typename KeyEq>
440 __device__ void CUDAHashmapImplContext<Hash, KeyEq>::WarpSyncKey(
441  const void* key_ptr, uint32_t lane_id, void* ret_key_ptr) {
442  auto dst_key_ptr = static_cast<int*>(ret_key_ptr);
443  auto src_key_ptr = static_cast<const int*>(key_ptr);
444  for (int i = 0; i < hash_fn_.key_size_in_int_; ++i) {
445  dst_key_ptr[i] =
446  __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
447  }
448 }
449 
450 template <typename Hash, typename KeyEq>
451 __device__ int32_t CUDAHashmapImplContext<Hash, KeyEq>::WarpFindKey(
452  const void* key_ptr, uint32_t lane_id, addr_t ptr) {
453  bool is_lane_found =
454  // Select key lanes.
455  ((1 << lane_id) & kNodePtrLanesMask)
456  // Validate key addrs.
457  && (ptr != kEmptyNodeAddr)
458  // Find keys in memory heap.
459  && cmp_fn_(kv_mgr_ctx_.ExtractIterator(ptr).first, key_ptr);
460 
461  return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
462 }
463 
464 template <typename Hash, typename KeyEq>
465 __device__ int32_t
466 CUDAHashmapImplContext<Hash, KeyEq>::WarpFindEmpty(addr_t ptr) {
467  bool is_lane_empty = (ptr == kEmptyNodeAddr);
468  return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
469 }
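// Worked example of the ballot/ffs pattern above: if only lane 5 of the key
// lanes sees a match, __ballot_sync(kNodePtrLanesMask, ...) yields 0x20,
// __ffs() returns 6, and the function reports lane index 5. If no lane
// matches, the ballot is 0, __ffs() returns 0, and the result is -1.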
470 
471 template <typename Hash, typename KeyEq>
472 __device__ int64_t
473 CUDAHashmapImplContext<Hash, KeyEq>::ComputeBucket(const void* key) const {
474  return hash_fn_(key) % bucket_count_;
475 }
476 
477 template <typename Hash, typename KeyEq>
478 __device__ addr_t
479 CUDAHashmapImplContext<Hash, KeyEq>::AllocateSlab(uint32_t lane_id) {
480  return node_mgr_ctx_.WarpAllocate(lane_id);
481 }
482 
483 template <typename Hash, typename KeyEq>
484 __device__ __forceinline__ void CUDAHashmapImplContext<Hash, KeyEq>::FreeSlab(
485  addr_t slab_ptr) {
486  node_mgr_ctx_.FreeUntouched(slab_ptr);
487 }
488 
489 template <typename Hash, typename KeyEq>
490 __global__ void InsertKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
491  const void* input_keys,
492  addr_t* output_addrs,
493  int heap_counter_prev,
494  int64_t count) {
495  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
496 
497  if (tid < count) {
498  // First write ALL input_keys to avoid potential thread conflicts.
499  addr_t iterator_addr =
500  hash_ctx.kv_mgr_ctx_.heap_[heap_counter_prev + tid];
501  iterator_t iterator =
502  hash_ctx.kv_mgr_ctx_.ExtractIterator(iterator_addr);
503 
504  MEMCPY_AS_INTS(iterator.first,
505  static_cast<const uint8_t*>(input_keys) +
506  tid * hash_ctx.dsize_key_,
507  hash_ctx.dsize_key_);
508  output_addrs[tid] = iterator_addr;
509  }
510 }
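// heap_counter_prev is the heap counter value before this batch was reserved
// (presumably advanced by the host before the launch), so thread tid writes
// its key into the pre-reserved slot heap_[heap_counter_prev + tid] and
// records the resulting addr_t for the later passes.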
511 
512 template <typename Hash, typename KeyEq>
513 __global__ void InsertKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
514  const void* input_keys,
515  addr_t* output_addrs,
516  bool* output_masks,
517  int64_t count) {
518  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
519  uint32_t lane_id = tid & 0x1F;
520 
521  if (tid - lane_id >= count) {
522  return;
523  }
524 
525  hash_ctx.node_mgr_ctx_.Init(tid, lane_id);
526 
527  bool lane_active = false;
528  uint32_t bucket_id = 0;
529  addr_t iterator_addr = 0;
530 
531  // Dummy.
532  uint8_t dummy_key[kMaxKeyByteSize];
533  const void* key = reinterpret_cast<const void*>(dummy_key);
534 
535  if (tid < count) {
536  lane_active = true;
537  key = static_cast<const uint8_t*>(input_keys) +
538  tid * hash_ctx.dsize_key_;
539  iterator_addr = output_addrs[tid];
540  bucket_id = hash_ctx.ComputeBucket(key);
541  }
542 
543  // Index out-of-bound threads still have to run for warp synchronization.
544  bool mask = hash_ctx.Insert(lane_active, lane_id, bucket_id, key,
545  iterator_addr);
546 
547  if (tid < count) {
548  output_masks[tid] = mask;
549  }
550 }
551 
552 template <typename Hash, typename KeyEq>
553 __global__ void InsertKernelPass2(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
554  const void* input_values,
555  addr_t* output_addrs,
556  bool* output_masks,
557  int64_t count) {
558  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
559 
560  if (tid < count) {
561  addr_t iterator_addr = output_addrs[tid];
562 
563  if (output_masks[tid]) {
564  iterator_t iterator =
565  hash_ctx.kv_mgr_ctx_.ExtractIterator(iterator_addr);
566 
567  // Success: copy remaining input_values
568  if (input_values != nullptr) {
569  MEMCPY_AS_INTS(iterator.second,
570  static_cast<const uint8_t*>(input_values) +
571  tid * hash_ctx.dsize_value_,
572  hash_ctx.dsize_value_);
573  }
574 
575  } else {
576  hash_ctx.kv_mgr_ctx_.DeviceFree(iterator_addr);
577  }
578  }
579 }
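// input_values may be nullptr; in that case nothing is copied and the kernel
// only does the bookkeeping: successful insertions keep their slot, failed
// ones return it through DeviceFree().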
580 
581 template <typename Hash, typename KeyEq>
582 __global__ void FindKernel(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
583  const void* input_keys,
584  addr_t* output_addrs,
585  bool* output_masks,
586  int64_t count) {
587  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
588  uint32_t lane_id = threadIdx.x & 0x1F;
589 
590  // This warp is idle.
591  if ((tid - lane_id) >= count) {
592  return;
593  }
594 
595  // Initialize the memory allocator on each warp.
596  hash_ctx.node_mgr_ctx_.Init(tid, lane_id);
597 
598  bool lane_active = false;
599  uint32_t bucket_id = 0;
600 
601  // Dummy.
602  uint8_t dummy_key[kMaxKeyByteSize];
603  const void* key = reinterpret_cast<const void*>(dummy_key);
604  Pair<addr_t, bool> result;
605 
606  if (tid < count) {
607  lane_active = true;
608  key = static_cast<const uint8_t*>(input_keys) +
609  tid * hash_ctx.dsize_key_;
610  bucket_id = hash_ctx.ComputeBucket(key);
611  }
612 
613  result = hash_ctx.Find(lane_active, lane_id, bucket_id, key);
614 
615  if (tid < count) {
616  output_addrs[tid] = result.first;
617  output_masks[tid] = result.second;
618  }
619 }
620 
621 template <typename Hash, typename KeyEq>
622 __global__ void EraseKernelPass0(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
623  const void* input_keys,
624  addr_t* output_addrs,
625  bool* output_masks,
626  int64_t count) {
627  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
628  uint32_t lane_id = threadIdx.x & 0x1F;
629 
630  if (tid - lane_id >= count) {
631  return;
632  }
633 
634  hash_ctx.node_mgr_ctx_.Init(tid, lane_id);
635 
636  bool lane_active = false;
637  uint32_t bucket_id = 0;
638 
639  uint8_t dummy_key[kMaxKeyByteSize];
640  const void* key = reinterpret_cast<const void*>(dummy_key);
641 
642  if (tid < count) {
643  lane_active = true;
644  key = static_cast<const uint8_t*>(input_keys) +
645  tid * hash_ctx.dsize_key_;
646  bucket_id = hash_ctx.ComputeBucket(key);
647  }
648 
649  auto result = hash_ctx.Erase(lane_active, lane_id, bucket_id, key);
650 
651  if (tid < count) {
652  output_addrs[tid] = result.first;
653  output_masks[tid] = result.second;
654  }
655 }
656 
657 template <typename Hash, typename KeyEq>
658 __global__ void EraseKernelPass1(CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
659  addr_t* output_addrs,
660  bool* output_masks,
661  int64_t count) {
662  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
663  if (tid < count && output_masks[tid]) {
664  hash_ctx.kv_mgr_ctx_.DeviceFree(output_addrs[tid]);
665  }
666 }
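// Erase is split into two passes for the same reason as insert: Pass0 detaches
// matching pairs from the slab lists and records their addresses, and Pass1
// frees those buffer entries for the threads whose masks are set.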
667 
668 template <typename Hash, typename KeyEq>
669 __global__ void GetActiveIndicesKernel(
670  CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
671  addr_t* output_addrs,
672  uint32_t* output_iterator_count) {
673  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
674  uint32_t lane_id = threadIdx.x & 0x1F;
675 
676  // Assigning a warp per bucket.
677  uint32_t bucket_id = tid >> 5;
678  if (bucket_id >= hash_ctx.bucket_count_) {
679  return;
680  }
681 
682  hash_ctx.node_mgr_ctx_.Init(tid, lane_id);
683 
684  uint32_t src_unit_data =
685  *hash_ctx.get_unit_ptr_from_list_head(bucket_id, lane_id);
686  bool is_active = src_unit_data != kEmptyNodeAddr;
687 
688  if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
689  uint32_t index = atomicAdd(output_iterator_count, 1);
690  output_addrs[index] = src_unit_data;
691  }
692 
693  addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
694  kWarpSize);
695 
696  // Collect active entries from the following slabs.
697  while (next != kEmptySlabAddr) {
698  src_unit_data = *hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id);
699  is_active = (src_unit_data != kEmptyNodeAddr);
700 
701  if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
702  uint32_t index = atomicAdd(output_iterator_count, 1);
703  output_addrs[index] = src_unit_data;
704  }
705  next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
706  kWarpSize);
707  }
708 }
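// This warp-per-bucket traversal compacts all active addr_t handles into
// output_addrs via atomicAdd on output_iterator_count, so the output order is
// non-deterministic across runs.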
709 
710 template <typename Hash, typename KeyEq>
711 __global__ void CountElemsPerBucketKernel(
712  CUDAHashmapImplContext<Hash, KeyEq> hash_ctx,
713  int64_t* bucket_elem_counts) {
714  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
715  uint32_t lane_id = threadIdx.x & 0x1F;
716 
717  // Assigning a warp per bucket.
718  uint32_t bucket_id = tid >> 5;
719  if (bucket_id >= hash_ctx.bucket_count_) {
720  return;
721  }
722 
723  hash_ctx.node_mgr_ctx_.Init(tid, lane_id);
724 
725  uint32_t count = 0;
726 
727  // Count head node.
728  uint32_t src_unit_data =
729  *hash_ctx.get_unit_ptr_from_list_head(bucket_id, lane_id);
730  count += __popc(
731  __ballot_sync(kNodePtrLanesMask, src_unit_data != kEmptyNodeAddr));
732  addr_t next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
733  kWarpSize);
734 
735  // Count following nodes.
736  while (next != kEmptySlabAddr) {
737  src_unit_data = *hash_ctx.get_unit_ptr_from_list_nodes(next, lane_id);
738  count += __popc(__ballot_sync(kNodePtrLanesMask,
739  src_unit_data != kEmptyNodeAddr));
740  next = __shfl_sync(kSyncLanesMask, src_unit_data, kNextSlabPtrLaneId,
741  kWarpSize);
742  }
743 
744  // Write back the results.
745  if (lane_id == 0) {
746  bucket_elem_counts[bucket_id] = count;
747  }
748 }
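// Example of the counting trick above: if 7 of the 31 key lanes hold a
// non-empty addr_t, __ballot_sync(kNodePtrLanesMask, ...) sets 7 bits,
// __popc() adds 7 to the per-bucket count, and lane 0 finally writes the
// total for its bucket.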
749 
750 } // namespace core
751 } // namespace open3d