Open3D (C++ API)  0.18.0
StdGPUHashBackend.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2023 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <stdgpu/memory.h>
11 #include <thrust/device_vector.h>
12 #include <thrust/transform.h>
13 
14 #include <stdgpu/unordered_map.cuh>
15 #include <type_traits>
16 
17 #include "open3d/core/CUDAUtils.h"
22 
23 namespace open3d {
24 namespace core {
25 
31 template <typename T>
33 public:
35  using value_type = T;
36 
38  StdGPUAllocator() = default;
39 
41  explicit StdGPUAllocator(int device_id) : std_allocator_(device_id) {}
42 
44  StdGPUAllocator(const StdGPUAllocator&) = default;
45 
48 
51 
54 
56  template <typename U>
58  : std_allocator_(other.std_allocator_) {}
59 
62  T* p = std_allocator_.allocate(n);
63  stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
64  return p;
65  }
66 
68  void deallocate(T* p, std::size_t n) {
69  stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
70  std_allocator_.deallocate(p, n);
71  }
72 
74  bool operator==(const StdGPUAllocator& other) {
75  return std_allocator_ == other.std_allocator_;
76  }
77 
79  bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }
80 
81 private:
82  // Allow access in rebind constructor.
83  template <typename T2>
84  friend class StdGPUAllocator;
85 
86  StdAllocator<T> std_allocator_;
87 };
88 
89 // These typedefs must be defined outside of StdGPUHashBackend to make them
90 // accessible in raw CUDA kernels.
91 template <typename Key>
94 
95 template <typename Key, typename Hash, typename Eq>
97  stdgpu::unordered_map<Key,
99  Hash,
100  Eq,
102 
103 template <typename Key, typename Hash, typename Eq>
105 public:
106  StdGPUHashBackend(int64_t init_capacity,
107  int64_t key_dsize,
108  const std::vector<int64_t>& value_dsizes,
109  const Device& device);
111 
112  void Reserve(int64_t capacity) override;
113 
114  void Insert(const void* input_keys,
115  const std::vector<const void*>& input_values_soa,
116  buf_index_t* output_buf_indices,
117  bool* output_masks,
118  int64_t count) override;
119 
120  void Find(const void* input_keys,
121  buf_index_t* output_buf_indices,
122  bool* output_masks,
123  int64_t count) override;
124 
125  void Erase(const void* input_keys,
126  bool* output_masks,
127  int64_t count) override;
128 
129  int64_t GetActiveIndices(buf_index_t* output_indices) override;
130 
131  void Clear() override;
132 
133  int64_t Size() const override;
134 
135  int64_t GetBucketCount() const override;
136  std::vector<int64_t> BucketSizes() const override;
137  float LoadFactor() const override;
138 
140 
141  void Allocate(int64_t capacity);
142  void Free();
143 
144 protected:
145  // Use reference, since the structure itself is implicitly handled as a
146  // pointer directly by stdgpu.
148 
150 };
151 
152 template <typename Key, typename Hash, typename Eq>
154  int64_t init_capacity,
155  int64_t key_dsize,
156  const std::vector<int64_t>& value_dsizes,
157  const Device& device)
158  : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
159  CUDAScopedDevice scoped_device(this->device_);
160  Allocate(init_capacity);
161 }
162 
163 template <typename Key, typename Hash, typename Eq>
165  CUDAScopedDevice scoped_device(this->device_);
166  Free();
167 }
168 
169 template <typename Key, typename Hash, typename Eq>
171  CUDAScopedDevice scoped_device(this->device_);
172  return impl_.size();
173 }
174 
175 // Need an explicit kernel for non-const access to map
176 template <typename Key, typename Hash, typename Eq>
178  CUDAHashBackendBufferAccessor buffer_accessor,
179  const Key* input_keys,
180  buf_index_t* output_buf_indices,
181  bool* output_masks,
182  int64_t count) {
183  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
184  if (tid >= count) return;
185 
186  Key key = input_keys[tid];
187  auto iter = map.find(key);
188  bool flag = (iter != map.end());
189  output_masks[tid] = flag;
190  output_buf_indices[tid] = flag ? iter->second : 0;
191 }
192 
193 template <typename Key, typename Hash, typename Eq>
194 void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
195  buf_index_t* output_buf_indices,
196  bool* output_masks,
197  int64_t count) {
198  CUDAScopedDevice scoped_device(this->device_);
199  uint32_t threads = 128;
200  uint32_t blocks = (count + threads - 1) / threads;
201 
202  STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
203  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
204  output_buf_indices, output_masks, count);
205  cuda::Synchronize(this->device_);
206 }
207 
208 // Need an explicit kernel for non-const access to map
209 template <typename Key, typename Hash, typename Eq>
211  CUDAHashBackendBufferAccessor buffer_accessor,
212  const Key* input_keys,
213  buf_index_t* output_buf_indices,
214  bool* output_masks,
215  int64_t count) {
216  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
217  if (tid >= count) return;
218 
219  Key key = input_keys[tid];
220  auto iter = map.find(key);
221  bool flag = (iter != map.end());
222  output_masks[tid] = flag;
223  output_buf_indices[tid] = flag ? iter->second : 0;
224 
225  if (output_masks[tid]) {
226  output_masks[tid] = map.erase(key);
227  if (output_masks[tid]) {
228  buffer_accessor.DeviceFree(output_buf_indices[tid]);
229  }
230  }
231 }
232 
233 template <typename Key, typename Hash, typename Eq>
234 void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
235  bool* output_masks,
236  int64_t count) {
237  CUDAScopedDevice scoped_device(this->device_);
238  uint32_t threads = 128;
239  uint32_t blocks = (count + threads - 1) / threads;
240 
241  core::Tensor toutput_buf_indices =
242  core::Tensor({count}, core::Int32, this->device_);
243  buf_index_t* output_buf_indices =
244  static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());
245 
246  STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
247  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
248  output_buf_indices, output_masks, count);
249  cuda::Synchronize(this->device_);
250 }
251 
252 template <typename Key>
255  operator()(const thrust::pair<Key, buf_index_t>& x) const {
256  return x.second;
257  }
258 };
259 
260 template <typename Key, typename Hash, typename Eq>
262  buf_index_t* output_indices) {
263  CUDAScopedDevice scoped_device(this->device_);
264  auto range = impl_.device_range();
265 
266  thrust::transform(range.begin(), range.end(), output_indices,
268 
269  return impl_.size();
270 }
271 
272 template <typename Key, typename Hash, typename Eq>
274  CUDAScopedDevice scoped_device(this->device_);
275  impl_.clear();
276  this->buffer_->ResetHeap();
277 }
278 
279 template <typename Key, typename Hash, typename Eq>
281  CUDAScopedDevice scoped_device(this->device_);
282 }
283 
284 template <typename Key, typename Hash, typename Eq>
286  CUDAScopedDevice scoped_device(this->device_);
287  return impl_.bucket_count();
288 }
289 
290 template <typename Key, typename Hash, typename Eq>
291 std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
292  CUDAScopedDevice scoped_device(this->device_);
293  utility::LogError("Unimplemented");
294 }
295 
296 template <typename Key, typename Hash, typename Eq>
298  CUDAScopedDevice scoped_device(this->device_);
299  return impl_.load_factor();
300 }
301 
302 // Need an explicit kernel for non-const access to map
303 template <typename Key, typename Hash, typename Eq, typename block_t>
304 __global__ void STDGPUInsertKernel(
306  CUDAHashBackendBufferAccessor buffer_accessor,
307  const Key* input_keys,
308  const void* const* input_values_soa,
309  buf_index_t* output_buf_indices,
310  bool* output_masks,
311  int64_t count,
312  int64_t n_values) {
313  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
314  if (tid >= count) return;
315 
316  Key key = input_keys[tid];
317  output_buf_indices[tid] = 0;
318  output_masks[tid] = false;
319 
320  // First apply 'try insert' with a dummy index
321  auto res = map.emplace(key, 0);
322 
323  // If success, change the iterator and provide the actual index
324  if (res.second) {
325  buf_index_t buf_index = buffer_accessor.DeviceAllocate();
326  auto key_ptr = buffer_accessor.GetKeyPtr(buf_index);
327 
328  // Copy templated key to buffer (duplicate)
329  // TODO: hack stdgpu inside and take out the buffer directly
330  *static_cast<Key*>(key_ptr) = key;
331 
332  // Copy/reset non-templated value in buffer
333  for (int j = 0; j < n_values; ++j) {
334  const int64_t blocks_per_element =
335  buffer_accessor.value_blocks_per_element_[j];
336 
337  block_t* dst_value = static_cast<block_t*>(
338  buffer_accessor.GetValuePtr(buf_index, j));
339  const block_t* src_value =
340  static_cast<const block_t*>(input_values_soa[j]) +
341  blocks_per_element * tid;
342  for (int b = 0; b < blocks_per_element; ++b) {
343  dst_value[b] = src_value[b];
344  }
345  }
346 
347  // Update from the dummy index
348  res.first->second = buf_index;
349 
350  // Write to return variables
351  output_buf_indices[tid] = buf_index;
352  output_masks[tid] = true;
353  }
354 }
355 
356 template <typename Key, typename Hash, typename Eq>
358  const void* input_keys,
359  const std::vector<const void*>& input_values_soa,
360  buf_index_t* output_buf_indices,
361  bool* output_masks,
362  int64_t count) {
363  CUDAScopedDevice scoped_device(this->device_);
364  uint32_t threads = 128;
365  uint32_t blocks = (count + threads - 1) / threads;
366 
367  thrust::device_vector<const void*> input_values_soa_device(
368  input_values_soa.begin(), input_values_soa.end());
369 
370  int64_t n_values = input_values_soa.size();
371  const void* const* ptr_input_values_soa =
372  thrust::raw_pointer_cast(input_values_soa_device.data());
373 
374  DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
375  buffer_accessor_.common_block_size_, [&]() {
376  STDGPUInsertKernel<Key, Hash, Eq, block_t>
377  <<<blocks, threads, 0, core::cuda::GetStream()>>>(
378  impl_, buffer_accessor_,
379  static_cast<const Key*>(input_keys),
380  ptr_input_values_soa, output_buf_indices,
381  output_masks, count, n_values);
382  });
383  cuda::Synchronize(this->device_);
384 }
385 
386 template <typename Key, typename Hash, typename Eq>
388  CUDAScopedDevice scoped_device(this->device_);
389  this->capacity_ = capacity;
390 
391  // Allocate buffer for key values.
392  this->buffer_ = std::make_shared<HashBackendBuffer>(
393  this->capacity_, this->key_dsize_, this->value_dsizes_,
394  this->device_);
395  buffer_accessor_.Setup(*this->buffer_);
396 
397  // stdgpu initializes on the default stream. Set the current stream to
398  // ensure correct behavior.
399  {
400  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
401 
403  this->capacity_,
404  InternalStdGPUHashBackendAllocator<Key>(this->device_.GetID()));
405  cuda::Synchronize(this->device_);
406  }
407 }
408 
409 template <typename Key, typename Hash, typename Eq>
411  CUDAScopedDevice scoped_device(this->device_);
412  // Buffer is automatically handled by the smart pointer.
413  buffer_accessor_.Shutdown(this->device_);
414 
415  // stdgpu initializes on the default stream. Set the current stream to
416  // ensure correct behavior.
417  {
418  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
419 
421  }
422 }
423 } // namespace core
424 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:44
#define LogError(...)
Definition: Logging.h:48
Definition: CUDAHashBackendBufferAccessor.h:24
int64_t * value_blocks_per_element_
Definition: CUDAHashBackendBufferAccessor.h:108
__device__ buf_index_t DeviceAllocate()
Definition: CUDAHashBackendBufferAccessor.h:79
__device__ void DeviceFree(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:83
__device__ void * GetKeyPtr(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:88
__device__ void * GetValuePtr(buf_index_t ptr, int value_idx=0)
Definition: CUDAHashBackendBufferAccessor.h:91
When CUDA is not enabled, this is a dummy class.
Definition: CUDAUtils.h:214
Definition: DeviceHashBackend.h:20
Device device_
Definition: DeviceHashBackend.h:100
Definition: Device.h:18
Definition: StdAllocator.h:23
Definition: StdGPUHashBackend.h:32
T * allocate(std::size_t n)
Allocates memory of size n.
Definition: StdGPUHashBackend.h:61
StdGPUAllocator()=default
Default constructor.
StdGPUAllocator(int device_id)
Constructor from device.
Definition: StdGPUHashBackend.h:41
StdGPUAllocator(const StdGPUAllocator &)=default
Default copy constructor.
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n .
Definition: StdGPUHashBackend.h:68
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition: StdGPUHashBackend.h:74
StdGPUAllocator(StdGPUAllocator &&)=default
Default move constructor.
T value_type
T.
Definition: StdGPUHashBackend.h:35
StdGPUAllocator & operator=(StdGPUAllocator &&)=default
Default move assignment operator.
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition: StdGPUHashBackend.h:79
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition: StdGPUHashBackend.h:57
Definition: StdGPUHashBackend.h:104
StdGPUHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
Definition: StdGPUHashBackend.h:153
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: StdGPUHashBackend.h:234
~StdGPUHashBackend()
Definition: StdGPUHashBackend.h:164
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
Definition: StdGPUHashBackend.h:297
InternalStdGPUHashBackend< Key, Hash, Eq > GetImpl() const
Definition: StdGPUHashBackend.h:139
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: StdGPUHashBackend.h:194
void Free()
Definition: StdGPUHashBackend.h:410
void Insert(const void *input_keys, const std::vector< const void * > &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: StdGPUHashBackend.h:357
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
Definition: StdGPUHashBackend.h:291
InternalStdGPUHashBackend< Key, Hash, Eq > impl_
Definition: StdGPUHashBackend.h:147
void Reserve(int64_t capacity) override
Definition: StdGPUHashBackend.h:280
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
Definition: StdGPUHashBackend.h:285
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: StdGPUHashBackend.h:261
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
Definition: StdGPUHashBackend.h:170
void Allocate(int64_t capacity)
Definition: StdGPUHashBackend.h:387
CUDAHashBackendBufferAccessor buffer_accessor_
Definition: StdGPUHashBackend.h:149
void Clear() override
Clear stored map without reallocating memory.
Definition: StdGPUHashBackend.h:273
Definition: Tensor.h:32
T * GetDataPtr()
Definition: Tensor.h:1143
int count
Definition: FilePCD.cpp:42
void Synchronize()
Definition: CUDAUtils.cpp:58
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:177
uint32_t buf_index_t
Definition: HashBackendBuffer.h:44
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:210
const Dtype Int32
Definition: Dtype.cpp:46
stdgpu::unordered_map< Key, buf_index_t, Hash, Eq, InternalStdGPUHashBackendAllocator< Key > > InternalStdGPUHashBackend
Definition: StdGPUHashBackend.h:101
__global__ void STDGPUInsertKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, const void *const *input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count, int64_t n_values)
Definition: StdGPUHashBackend.h:304
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:548
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle image_handle timestamp_usec white_balance image_handle k4a_device_configuration_t config device_handle char size_t serial_number_size bool int32_t int32_t int32_t int32_t k4a_color_control_mode_t default_mode value const const k4a_calibration_t calibration char size_t
Definition: K4aPlugin.cpp:719
Definition: PinholeCameraIntrinsic.cpp:16
Definition: StdGPUHashBackend.h:253
OPEN3D_HOST_DEVICE buf_index_t operator()(const thrust::pair< Key, buf_index_t > &x) const
Definition: StdGPUHashBackend.h:255