Open3D (C++ API)  0.18.0
SlabHashBackend.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2023 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <memory>
11 
12 #include "open3d/core/CUDAUtils.h"
16 
17 namespace open3d {
18 namespace core {
19 template <typename Key, typename Hash, typename Eq>
21 public:
22  SlabHashBackend(int64_t init_capacity,
23  int64_t key_dsize,
24  const std::vector<int64_t>& value_dsizes,
25  const Device& device);
26 
28 
29  void Reserve(int64_t capacity) override;
30 
31  void Insert(const void* input_keys,
32  const std::vector<const void*>& input_values_soa,
33  buf_index_t* output_buf_indices,
34  bool* output_masks,
35  int64_t count) override;
36 
37  void Find(const void* input_keys,
38  buf_index_t* output_buf_indices,
39  bool* output_masks,
40  int64_t count) override;
41 
42  void Erase(const void* input_keys,
43  bool* output_masks,
44  int64_t count) override;
45 
46  int64_t GetActiveIndices(buf_index_t* output_indices) override;
47  void Clear() override;
48 
49  int64_t Size() const override;
50  int64_t GetBucketCount() const override;
51  std::vector<int64_t> BucketSizes() const override;
52  float LoadFactor() const override;
53 
55 
56  void Allocate(int64_t capacity) override;
57  void Free() override;
58 
59 protected:
63 
65  std::shared_ptr<SlabNodeManager> node_mgr_;
66 
67  int64_t bucket_count_;
68 };
69 
70 template <typename Key, typename Hash, typename Eq>
72  int64_t init_capacity,
73  int64_t key_dsize,
74  const std::vector<int64_t>& value_dsizes,
75  const Device& device)
76  : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
77  CUDAScopedDevice scoped_device(this->device_);
78  Allocate(init_capacity);
79 }
80 
81 template <typename Key, typename Hash, typename Eq>
83  CUDAScopedDevice scoped_device(this->device_);
84  Free();
85 }
86 
87 template <typename Key, typename Hash, typename Eq>
89  CUDAScopedDevice scoped_device(this->device_);
90 }
91 
92 template <typename Key, typename Hash, typename Eq>
93 void SlabHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
94  buf_index_t* output_buf_indices,
95  bool* output_masks,
96  int64_t count) {
97  CUDAScopedDevice scoped_device(this->device_);
98  if (count == 0) return;
99 
100  OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
102  OPEN3D_CUDA_CHECK(cudaGetLastError());
103 
104  const int64_t num_blocks =
105  (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
106  FindKernel<<<num_blocks, kThreadsPerBlock, 0, core::cuda::GetStream()>>>(
107  impl_, input_keys, output_buf_indices, output_masks, count);
109  OPEN3D_CUDA_CHECK(cudaGetLastError());
110 }
111 
112 template <typename Key, typename Hash, typename Eq>
113 void SlabHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
114  bool* output_masks,
115  int64_t count) {
116  CUDAScopedDevice scoped_device(this->device_);
117  if (count == 0) return;
118 
119  OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
121  OPEN3D_CUDA_CHECK(cudaGetLastError());
122  auto buf_indices = static_cast<buf_index_t*>(
123  MemoryManager::Malloc(sizeof(buf_index_t) * count, this->device_));
124 
125  const int64_t num_blocks =
126  (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
127  EraseKernelPass0<<<num_blocks, kThreadsPerBlock, 0,
128  core::cuda::GetStream()>>>(
129  impl_, input_keys, buf_indices, output_masks, count);
130  EraseKernelPass1<<<num_blocks, kThreadsPerBlock, 0,
131  core::cuda::GetStream()>>>(impl_, buf_indices,
132  output_masks, count);
134  OPEN3D_CUDA_CHECK(cudaGetLastError());
135 
136  MemoryManager::Free(buf_indices, this->device_);
137 }
138 
139 template <typename Key, typename Hash, typename Eq>
141  buf_index_t* output_buf_indices) {
142  CUDAScopedDevice scoped_device(this->device_);
143  uint32_t* count = static_cast<uint32_t*>(
144  MemoryManager::Malloc(sizeof(uint32_t), this->device_));
145  OPEN3D_CUDA_CHECK(cudaMemset(count, 0, sizeof(uint32_t)));
146 
148  OPEN3D_CUDA_CHECK(cudaGetLastError());
149 
150  const int64_t num_blocks =
151  (impl_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
152  kThreadsPerBlock;
153  GetActiveIndicesKernel<<<num_blocks, kThreadsPerBlock, 0,
154  core::cuda::GetStream()>>>(
155  impl_, output_buf_indices, count);
157  OPEN3D_CUDA_CHECK(cudaGetLastError());
158 
159  uint32_t ret;
160  MemoryManager::MemcpyToHost(&ret, count, this->device_, sizeof(uint32_t));
161  MemoryManager::Free(count, this->device_);
162 
163  return static_cast<int64_t>(ret);
164 }
165 
166 template <typename Key, typename Hash, typename Eq>
168  CUDAScopedDevice scoped_device(this->device_);
169  // Clear the heap
170  this->buffer_->ResetHeap();
171 
172  // Clear the linked list heads
173  OPEN3D_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF,
174  sizeof(Slab) * this->bucket_count_));
176  OPEN3D_CUDA_CHECK(cudaGetLastError());
177 
178  // Clear the linked list nodes
179  node_mgr_->Reset();
180 }
181 
182 template <typename Key, typename Hash, typename Eq>
184  CUDAScopedDevice scoped_device(this->device_);
185  return this->buffer_->GetHeapTopIndex();
186 }
187 
188 template <typename Key, typename Hash, typename Eq>
190  CUDAScopedDevice scoped_device(this->device_);
191  return bucket_count_;
192 }
193 
194 template <typename Key, typename Hash, typename Eq>
195 std::vector<int64_t> SlabHashBackend<Key, Hash, Eq>::BucketSizes() const {
196  CUDAScopedDevice scoped_device(this->device_);
197  thrust::device_vector<int64_t> elems_per_bucket(impl_.bucket_count_);
198  thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);
199 
200  const int64_t num_blocks =
201  (impl_.buffer_accessor_.capacity_ + kThreadsPerBlock - 1) /
202  kThreadsPerBlock;
203  CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock, 0,
204  core::cuda::GetStream()>>>(
205  impl_, thrust::raw_pointer_cast(elems_per_bucket.data()));
207  OPEN3D_CUDA_CHECK(cudaGetLastError());
208 
209  std::vector<int64_t> result(impl_.bucket_count_);
210  thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
211  result.begin());
212  return result;
213 }
214 
215 template <typename Key, typename Hash, typename Eq>
217  CUDAScopedDevice scoped_device(this->device_);
218  return float(Size()) / float(this->bucket_count_);
219 }
220 
221 template <typename Key, typename Hash, typename Eq>
223  const void* input_keys,
224  const std::vector<const void*>& input_values_soa,
225  buf_index_t* output_buf_indices,
226  bool* output_masks,
227  int64_t count) {
228  CUDAScopedDevice scoped_device(this->device_);
229  if (count == 0) return;
230 
233  int prev_heap_top = this->buffer_->GetHeapTopIndex();
234  *thrust::device_ptr<int>(impl_.buffer_accessor_.heap_top_) =
235  prev_heap_top + count;
236 
237  const int64_t num_blocks =
238  (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
239  InsertKernelPass0<<<num_blocks, kThreadsPerBlock, 0,
240  core::cuda::GetStream()>>>(
241  impl_, input_keys, output_buf_indices, prev_heap_top, count);
242  InsertKernelPass1<<<num_blocks, kThreadsPerBlock, 0,
243  core::cuda::GetStream()>>>(
244  impl_, input_keys, output_buf_indices, output_masks, count);
245 
246  thrust::device_vector<const void*> input_values_soa_device(
247  input_values_soa.begin(), input_values_soa.end());
248 
249  int64_t n_values = input_values_soa.size();
250  const void* const* ptr_input_values_soa =
251  thrust::raw_pointer_cast(input_values_soa_device.data());
252  DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
253  impl_.buffer_accessor_.common_block_size_, [&]() {
254  InsertKernelPass2<Key, Hash, Eq, block_t>
255  <<<num_blocks, kThreadsPerBlock, 0,
256  core::cuda::GetStream()>>>(
257  impl_, ptr_input_values_soa, output_buf_indices,
258  output_masks, count, n_values);
259  });
261  OPEN3D_CUDA_CHECK(cudaGetLastError());
262 }
263 
264 template <typename Key, typename Hash, typename Eq>
266  CUDAScopedDevice scoped_device(this->device_);
267  this->bucket_count_ = capacity * 2;
268  this->capacity_ = capacity;
269 
270  // Allocate buffer for key values.
271  this->buffer_ = std::make_shared<HashBackendBuffer>(
272  this->capacity_, this->key_dsize_, this->value_dsizes_,
273  this->device_);
274  buffer_accessor_.Setup(*this->buffer_);
275 
276  // Allocate buffer for linked list nodes.
277  node_mgr_ = std::make_shared<SlabNodeManager>(this->device_);
278 
279  // Allocate linked list heads.
280  impl_.bucket_list_head_ = static_cast<Slab*>(MemoryManager::Malloc(
281  sizeof(Slab) * this->bucket_count_, this->device_));
282  OPEN3D_CUDA_CHECK(cudaMemset(impl_.bucket_list_head_, 0xFF,
283  sizeof(Slab) * this->bucket_count_));
285  OPEN3D_CUDA_CHECK(cudaGetLastError());
286 
287  impl_.Setup(this->bucket_count_, node_mgr_->impl_, buffer_accessor_);
288 }
289 
290 template <typename Key, typename Hash, typename Eq>
292  CUDAScopedDevice scoped_device(this->device_);
293  buffer_accessor_.Shutdown(this->device_);
294  MemoryManager::Free(impl_.bucket_list_head_, this->device_);
295 }
296 } // namespace core
297 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:47
core::Tensor result
Definition: VtkUtils.cpp:75
bool copy
Definition: VtkUtils.cpp:73
Definition: CUDAHashBackendBufferAccessor.h:24
When CUDA is not enabled, this is a dummy class.
Definition: CUDAUtils.h:214
Definition: DeviceHashBackend.h:20
Device device_
Definition: DeviceHashBackend.h:100
Definition: Device.h:18
static void MemcpyToHost(void *host_ptr, const void *src_ptr, const Device &src_device, size_t num_bytes)
Same as Memcpy, but with host (CPU:0) as default dst_device.
Definition: MemoryManager.cpp:85
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:22
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
Definition: MemoryManager.cpp:28
Definition: SlabHashBackend.h:20
SlabHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
Definition: SlabHashBackend.h:71
void Free() override
Definition: SlabHashBackend.h:291
CUDAHashBackendBufferAccessor buffer_accessor_
Definition: SlabHashBackend.h:64
void Allocate(int64_t capacity) override
Definition: SlabHashBackend.h:265
~SlabHashBackend()
Definition: SlabHashBackend.h:82
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
Definition: SlabHashBackend.h:216
std::shared_ptr< SlabNodeManager > node_mgr_
Definition: SlabHashBackend.h:65
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: SlabHashBackend.h:140
SlabHashBackendImpl< Key, Hash, Eq > impl_
Definition: SlabHashBackend.h:62
void Insert(const void *input_keys, const std::vector< const void * > &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: SlabHashBackend.h:222
SlabHashBackendImpl< Key, Hash, Eq > GetImpl()
Definition: SlabHashBackend.h:54
int64_t bucket_count_
Definition: SlabHashBackend.h:67
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
Definition: SlabHashBackend.h:183
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
Definition: SlabHashBackend.h:189
void Reserve(int64_t capacity) override
Definition: SlabHashBackend.h:88
void Clear() override
Clear stored map without reallocating memory.
Definition: SlabHashBackend.h:167
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
Definition: SlabHashBackend.h:195
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: SlabHashBackend.h:93
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: SlabHashBackend.h:113
Definition: SlabHashBackendImpl.h:45
Definition: SlabNodeManager.h:39
int count
Definition: FilePCD.cpp:42
void Synchronize()
Definition: CUDAUtils.cpp:58
__global__ void InsertKernelPass1(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: SlabHashBackendImpl.h:493
__global__ void InsertKernelPass0(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, int heap_counter_prev, int64_t count)
Kernels.
Definition: SlabHashBackendImpl.h:474
uint32_t buf_index_t
Definition: HashBackendBuffer.h:44
__global__ void EraseKernelPass1(SlabHashBackendImpl< Key, Hash, Eq > impl, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: SlabHashBackendImpl.h:636
__global__ void FindKernel(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: SlabHashBackendImpl.h:562
__global__ void EraseKernelPass0(SlabHashBackendImpl< Key, Hash, Eq > impl, const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: SlabHashBackendImpl.h:601
__global__ void GetActiveIndicesKernel(SlabHashBackendImpl< Key, Hash, Eq > impl, buf_index_t *output_buf_indices, uint32_t *output_count)
Definition: SlabHashBackendImpl.h:647
__global__ void CountElemsPerBucketKernel(SlabHashBackendImpl< Key, Hash, Eq > impl, int64_t *bucket_elem_counts)
Definition: SlabHashBackendImpl.h:687
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:548
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle float
Definition: K4aPlugin.cpp:460
Definition: PinholeCameraIntrinsic.cpp:16