Loading [MathJax]/extensions/TeX/AMSmath.js
Open3D (C++ API)  0.19.0
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
StdGPUHashBackend.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // Copyright (c) 2018-2024 www.open3d.org
5 // SPDX-License-Identifier: MIT
6 // ----------------------------------------------------------------------------
7 
8 #pragma once
9 
10 #include <stdgpu/memory.h>
11 #include <stdgpu/utility.h>
12 #include <thrust/device_vector.h>
13 #include <thrust/transform.h>
14 
15 #include <stdgpu/unordered_map.cuh>
16 #include <type_traits>
17 
18 #include "open3d/core/CUDAUtils.h"
23 
24 namespace open3d {
25 namespace core {
26 
32 template <typename T>
34 public:
36  using value_type = T;
37 
39  StdGPUAllocator() = default;
40 
42  explicit StdGPUAllocator(int device_id) : std_allocator_(device_id) {}
43 
45  StdGPUAllocator(const StdGPUAllocator&) = default;
46 
49 
52 
55 
57  template <typename U>
59  : std_allocator_(other.std_allocator_) {}
60 
63  T* p = std_allocator_.allocate(n);
64  stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
65  return p;
66  }
67 
69  void deallocate(T* p, std::size_t n) {
70  stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
71  std_allocator_.deallocate(p, n);
72  }
73 
75  bool operator==(const StdGPUAllocator& other) {
76  return std_allocator_ == other.std_allocator_;
77  }
78 
80  bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }
81 
82 private:
83  // Allow access in rebind constructor.
84  template <typename T2>
85  friend class StdGPUAllocator;
86 
87  StdAllocator<T> std_allocator_;
88 };
89 
90 // These typedefs must be defined outside of StdGPUHashBackend to make them
91 // accessible in raw CUDA kernels.
92 template <typename Key>
95 
96 template <typename Key, typename Hash, typename Eq>
98  stdgpu::unordered_map<Key,
100  Hash,
101  Eq,
103 
104 template <typename Key, typename Hash, typename Eq>
106 public:
107  StdGPUHashBackend(int64_t init_capacity,
108  int64_t key_dsize,
109  const std::vector<int64_t>& value_dsizes,
110  const Device& device);
112 
113  void Reserve(int64_t capacity) override;
114 
115  void Insert(const void* input_keys,
116  const std::vector<const void*>& input_values_soa,
117  buf_index_t* output_buf_indices,
118  bool* output_masks,
119  int64_t count) override;
120 
121  void Find(const void* input_keys,
122  buf_index_t* output_buf_indices,
123  bool* output_masks,
124  int64_t count) override;
125 
126  void Erase(const void* input_keys,
127  bool* output_masks,
128  int64_t count) override;
129 
130  int64_t GetActiveIndices(buf_index_t* output_indices) override;
131 
132  void Clear() override;
133 
134  int64_t Size() const override;
135 
136  int64_t GetBucketCount() const override;
137  std::vector<int64_t> BucketSizes() const override;
138  float LoadFactor() const override;
139 
141 
142  void Allocate(int64_t capacity);
143  void Free();
144 
145 protected:
146  // Use reference, since the structure itself is implicitly handled as a
147  // pointer directly by stdgpu.
149 
151 };
152 
153 template <typename Key, typename Hash, typename Eq>
155  int64_t init_capacity,
156  int64_t key_dsize,
157  const std::vector<int64_t>& value_dsizes,
158  const Device& device)
159  : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
160  CUDAScopedDevice scoped_device(this->device_);
161  Allocate(init_capacity);
162 }
163 
164 template <typename Key, typename Hash, typename Eq>
166  CUDAScopedDevice scoped_device(this->device_);
167  Free();
168 }
169 
170 template <typename Key, typename Hash, typename Eq>
172  CUDAScopedDevice scoped_device(this->device_);
173  return impl_.size();
174 }
175 
176 // Need an explicit kernel for non-const access to map
177 template <typename Key, typename Hash, typename Eq>
179  CUDAHashBackendBufferAccessor buffer_accessor,
180  const Key* input_keys,
181  buf_index_t* output_buf_indices,
182  bool* output_masks,
183  int64_t count) {
184  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
185  if (tid >= count) return;
186 
187  Key key = input_keys[tid];
188  auto iter = map.find(key);
189  bool flag = (iter != map.end());
190  output_masks[tid] = flag;
191  output_buf_indices[tid] = flag ? iter->second : 0;
192 }
193 
194 template <typename Key, typename Hash, typename Eq>
195 void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
196  buf_index_t* output_buf_indices,
197  bool* output_masks,
198  int64_t count) {
199  CUDAScopedDevice scoped_device(this->device_);
200  uint32_t threads = 128;
201  uint32_t blocks = (count + threads - 1) / threads;
202 
203  STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
204  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
205  output_buf_indices, output_masks, count);
206  cuda::Synchronize(this->device_);
207 }
208 
209 // Need an explicit kernel for non-const access to map
210 template <typename Key, typename Hash, typename Eq>
212  CUDAHashBackendBufferAccessor buffer_accessor,
213  const Key* input_keys,
214  buf_index_t* output_buf_indices,
215  bool* output_masks,
216  int64_t count) {
217  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
218  if (tid >= count) return;
219 
220  Key key = input_keys[tid];
221  auto iter = map.find(key);
222  bool flag = (iter != map.end());
223  output_masks[tid] = flag;
224  output_buf_indices[tid] = flag ? iter->second : 0;
225 
226  if (output_masks[tid]) {
227  output_masks[tid] = map.erase(key);
228  if (output_masks[tid]) {
229  buffer_accessor.DeviceFree(output_buf_indices[tid]);
230  }
231  }
232 }
233 
234 template <typename Key, typename Hash, typename Eq>
235 void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
236  bool* output_masks,
237  int64_t count) {
238  CUDAScopedDevice scoped_device(this->device_);
239  uint32_t threads = 128;
240  uint32_t blocks = (count + threads - 1) / threads;
241 
242  core::Tensor toutput_buf_indices =
243  core::Tensor({count}, core::Int32, this->device_);
244  buf_index_t* output_buf_indices =
245  static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());
246 
247  STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
248  impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
249  output_buf_indices, output_masks, count);
250  cuda::Synchronize(this->device_);
251 }
252 
253 template <typename Key>
256  operator()(const stdgpu::pair<Key, buf_index_t>& x) const {
257  return x.second;
258  }
259 };
260 
261 template <typename Key, typename Hash, typename Eq>
263  buf_index_t* output_indices) {
264  CUDAScopedDevice scoped_device(this->device_);
265  auto range = impl_.device_range();
266 
267  thrust::transform(range.begin(), range.end(), output_indices,
269 
270  return impl_.size();
271 }
272 
273 template <typename Key, typename Hash, typename Eq>
275  CUDAScopedDevice scoped_device(this->device_);
276  impl_.clear();
277  this->buffer_->ResetHeap();
278 }
279 
280 template <typename Key, typename Hash, typename Eq>
282  CUDAScopedDevice scoped_device(this->device_);
283 }
284 
285 template <typename Key, typename Hash, typename Eq>
287  CUDAScopedDevice scoped_device(this->device_);
288  return impl_.bucket_count();
289 }
290 
291 template <typename Key, typename Hash, typename Eq>
292 std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
293  CUDAScopedDevice scoped_device(this->device_);
294  utility::LogError("Unimplemented");
295 }
296 
297 template <typename Key, typename Hash, typename Eq>
299  CUDAScopedDevice scoped_device(this->device_);
300  return impl_.load_factor();
301 }
302 
303 // Need an explicit kernel for non-const access to map
304 template <typename Key, typename Hash, typename Eq, typename block_t>
305 __global__ void STDGPUInsertKernel(
307  CUDAHashBackendBufferAccessor buffer_accessor,
308  const Key* input_keys,
309  const void* const* input_values_soa,
310  buf_index_t* output_buf_indices,
311  bool* output_masks,
312  int64_t count,
313  int64_t n_values) {
314  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
315  if (tid >= count) return;
316 
317  Key key = input_keys[tid];
318  output_buf_indices[tid] = 0;
319  output_masks[tid] = false;
320 
321  // First apply 'try insert' with a dummy index
322  auto res = map.emplace(key, 0);
323 
324  // If success, change the iterator and provide the actual index
325  if (res.second) {
326  buf_index_t buf_index = buffer_accessor.DeviceAllocate();
327  auto key_ptr = buffer_accessor.GetKeyPtr(buf_index);
328 
329  // Copy templated key to buffer (duplicate)
330  // TODO: hack stdgpu inside and take out the buffer directly
331  *static_cast<Key*>(key_ptr) = key;
332 
333  // Copy/reset non-templated value in buffer
334  for (int j = 0; j < n_values; ++j) {
335  const int64_t blocks_per_element =
336  buffer_accessor.value_blocks_per_element_[j];
337 
338  block_t* dst_value = static_cast<block_t*>(
339  buffer_accessor.GetValuePtr(buf_index, j));
340  const block_t* src_value =
341  static_cast<const block_t*>(input_values_soa[j]) +
342  blocks_per_element * tid;
343  for (int b = 0; b < blocks_per_element; ++b) {
344  dst_value[b] = src_value[b];
345  }
346  }
347 
348  // Update from the dummy index
349  res.first->second = buf_index;
350 
351  // Write to return variables
352  output_buf_indices[tid] = buf_index;
353  output_masks[tid] = true;
354  }
355 }
356 
357 template <typename Key, typename Hash, typename Eq>
359  const void* input_keys,
360  const std::vector<const void*>& input_values_soa,
361  buf_index_t* output_buf_indices,
362  bool* output_masks,
363  int64_t count) {
364  CUDAScopedDevice scoped_device(this->device_);
365  uint32_t threads = 128;
366  uint32_t blocks = (count + threads - 1) / threads;
367 
368  thrust::device_vector<const void*> input_values_soa_device(
369  input_values_soa.begin(), input_values_soa.end());
370 
371  int64_t n_values = input_values_soa.size();
372  const void* const* ptr_input_values_soa =
373  thrust::raw_pointer_cast(input_values_soa_device.data());
374 
375  DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
376  buffer_accessor_.common_block_size_, [&]() {
377  STDGPUInsertKernel<Key, Hash, Eq, block_t>
378  <<<blocks, threads, 0, core::cuda::GetStream()>>>(
379  impl_, buffer_accessor_,
380  static_cast<const Key*>(input_keys),
381  ptr_input_values_soa, output_buf_indices,
382  output_masks, count, n_values);
383  });
384  cuda::Synchronize(this->device_);
385 }
386 
387 template <typename Key, typename Hash, typename Eq>
389  CUDAScopedDevice scoped_device(this->device_);
390  this->capacity_ = capacity;
391 
392  // Allocate buffer for key values.
393  this->buffer_ = std::make_shared<HashBackendBuffer>(
394  this->capacity_, this->key_dsize_, this->value_dsizes_,
395  this->device_);
396  buffer_accessor_.Setup(*this->buffer_);
397 
398  // stdgpu initializes on the default stream. Set the current stream to
399  // ensure correct behavior.
400  {
401  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
402 
404  this->capacity_,
405  InternalStdGPUHashBackendAllocator<Key>(this->device_.GetID()));
406  cuda::Synchronize(this->device_);
407  }
408 }
409 
410 template <typename Key, typename Hash, typename Eq>
412  CUDAScopedDevice scoped_device(this->device_);
413  // Buffer is automatically handled by the smart pointer.
414  buffer_accessor_.Shutdown(this->device_);
415 
416  // stdgpu initializes on the default stream. Set the current stream to
417  // ensure correct behavior.
418  {
419  CUDAScopedStream scoped_stream(cuda::GetDefaultStream());
420 
422  }
423 }
424 } // namespace core
425 } // namespace open3d
Common CUDA utilities.
#define OPEN3D_HOST_DEVICE
Definition: CUDAUtils.h:44
#define LogError(...)
Definition: Logging.h:51
Definition: CUDAHashBackendBufferAccessor.h:24
int64_t * value_blocks_per_element_
Definition: CUDAHashBackendBufferAccessor.h:108
__device__ buf_index_t DeviceAllocate()
Definition: CUDAHashBackendBufferAccessor.h:79
__device__ void DeviceFree(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:83
__device__ void * GetKeyPtr(buf_index_t ptr)
Definition: CUDAHashBackendBufferAccessor.h:88
__device__ void * GetValuePtr(buf_index_t ptr, int value_idx=0)
Definition: CUDAHashBackendBufferAccessor.h:91
When CUDA is not enabled, this is a dummy class.
Definition: CUDAUtils.h:214
Definition: DeviceHashBackend.h:20
Device device_
Definition: DeviceHashBackend.h:100
Definition: Device.h:18
Definition: StdAllocator.h:23
Definition: StdGPUHashBackend.h:33
T * allocate(std::size_t n)
Allocates memory of size n.
Definition: StdGPUHashBackend.h:62
StdGPUAllocator()=default
Default constructor.
StdGPUAllocator(int device_id)
Constructor from device.
Definition: StdGPUHashBackend.h:42
StdGPUAllocator(const StdGPUAllocator &)=default
Default copy constructor.
void deallocate(T *p, std::size_t n)
Deallocates memory from pointer p of size n .
Definition: StdGPUHashBackend.h:69
bool operator==(const StdGPUAllocator &other)
Returns true if the instances are equal, false otherwise.
Definition: StdGPUHashBackend.h:75
StdGPUAllocator(StdGPUAllocator &&)=default
Default move constructor.
T value_type
T.
Definition: StdGPUHashBackend.h:36
StdGPUAllocator & operator=(StdGPUAllocator &&)=default
Default move assignment operator.
StdGPUAllocator & operator=(const StdGPUAllocator &)=default
Default copy assignment operator.
bool operator!=(const StdGPUAllocator &other)
Returns true if the instances are not equal, false otherwise.
Definition: StdGPUHashBackend.h:80
StdGPUAllocator(const StdGPUAllocator< U > &other)
Rebind copy constructor.
Definition: StdGPUHashBackend.h:58
Definition: StdGPUHashBackend.h:105
StdGPUHashBackend(int64_t init_capacity, int64_t key_dsize, const std::vector< int64_t > &value_dsizes, const Device &device)
Definition: StdGPUHashBackend.h:154
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: StdGPUHashBackend.h:235
~StdGPUHashBackend()
Definition: StdGPUHashBackend.h:165
float LoadFactor() const override
Get the current load factor, defined as size / bucket count.
Definition: StdGPUHashBackend.h:298
InternalStdGPUHashBackend< Key, Hash, Eq > GetImpl() const
Definition: StdGPUHashBackend.h:140
void Find(const void *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: StdGPUHashBackend.h:195
void Free()
Definition: StdGPUHashBackend.h:411
void Insert(const void *input_keys, const std::vector< const void * > &input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: StdGPUHashBackend.h:358
std::vector< int64_t > BucketSizes() const override
Get the number of entries per bucket.
Definition: StdGPUHashBackend.h:292
InternalStdGPUHashBackend< Key, Hash, Eq > impl_
Definition: StdGPUHashBackend.h:148
void Reserve(int64_t capacity) override
Definition: StdGPUHashBackend.h:281
int64_t GetBucketCount() const override
Get the number of buckets of the hash map.
Definition: StdGPUHashBackend.h:286
int64_t GetActiveIndices(buf_index_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: StdGPUHashBackend.h:262
int64_t Size() const override
Get the size (number of valid entries) of the hash map.
Definition: StdGPUHashBackend.h:171
void Allocate(int64_t capacity)
Definition: StdGPUHashBackend.h:388
CUDAHashBackendBufferAccessor buffer_accessor_
Definition: StdGPUHashBackend.h:150
void Clear() override
Clear stored map without reallocating memory.
Definition: StdGPUHashBackend.h:274
Definition: Tensor.h:32
T * GetDataPtr()
Definition: Tensor.h:1143
int count
Definition: FilePCD.cpp:42
void Synchronize()
Definition: CUDAUtils.cpp:58
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:178
uint32_t buf_index_t
Definition: HashBackendBuffer.h:44
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, buf_index_t *output_buf_indices, bool *output_masks, int64_t count)
Definition: StdGPUHashBackend.h:211
const Dtype Int32
Definition: Dtype.cpp:46
stdgpu::unordered_map< Key, buf_index_t, Hash, Eq, InternalStdGPUHashBackendAllocator< Key > > InternalStdGPUHashBackend
Definition: StdGPUHashBackend.h:102
__global__ void STDGPUInsertKernel(InternalStdGPUHashBackend< Key, Hash, Eq > map, CUDAHashBackendBufferAccessor buffer_accessor, const Key *input_keys, const void *const *input_values_soa, buf_index_t *output_buf_indices, bool *output_masks, int64_t count, int64_t n_values)
Definition: StdGPUHashBackend.h:305
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:548
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle image_handle timestamp_usec white_balance image_handle k4a_device_configuration_t config device_handle char size_t serial_number_size bool int32_t int32_t int32_t int32_t k4a_color_control_mode_t default_mode value const const k4a_calibration_t calibration char size_t
Definition: K4aPlugin.cpp:719
Definition: PinholeCameraIntrinsic.cpp:16
Definition: StdGPUHashBackend.h:254
OPEN3D_HOST_DEVICE buf_index_t operator()(const stdgpu::pair< Key, buf_index_t > &x) const
Definition: StdGPUHashBackend.h:256