Open3D (C++ API)  0.12.0
HashmapCUDA.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
27 // Copyright 2019 Saman Ashkiani
28 // Rewritten by Wei Dong 2019 - 2020
29 // Licensed under the Apache License, Version 2.0 (the "License");
30 // you may not use this file except in compliance with the License.
31 // You may obtain a copy of the License at
32 //
33 // http://www.apache.org/licenses/LICENSE-2.0
34 //
35 // Unless required by applicable law or agreed to in writing, software
36 // distributed under the License is distributed on an "AS IS" BASIS,
37 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
38 // implied. See the License for the specific language governing permissions
39 // and limitations under the License.
40 
41 #pragma once
42 
43 #include <cassert>
44 #include <memory>
45 
46 #include "open3d/core/CUDAUtils.h"
49 
50 namespace open3d {
51 namespace core {
52 template <typename Hash, typename KeyEq>
53 class CUDAHashmap : public DeviceHashmap<Hash, KeyEq> {
54 public:
55  CUDAHashmap(int64_t init_buckets,
56  int64_t init_capacity,
57  int64_t dsize_key,
58  int64_t dsize_value,
59  const Device& device);
60 
61  ~CUDAHashmap();
62 
63  void Rehash(int64_t buckets) override;
64 
65  void Insert(const void* input_keys,
66  const void* input_values,
67  addr_t* output_addrs,
68  bool* output_masks,
69  int64_t count) override;
70 
71  void Activate(const void* input_keys,
72  addr_t* output_addrs,
73  bool* output_masks,
74  int64_t count) override;
75 
76  void Find(const void* input_keys,
77  addr_t* output_addrs,
78  bool* output_masks,
79  int64_t count) override;
80 
81  void Erase(const void* input_keys,
82  bool* output_masks,
83  int64_t count) override;
84 
85  int64_t GetActiveIndices(addr_t* output_indices) override;
86 
87  int64_t Size() const override;
88 
89  std::vector<int64_t> BucketSizes() const override;
90  float LoadFactor() const override;
91 
92 protected:
96 
97  CUDAHashmapBufferContext buffer_ctx_;
98  std::shared_ptr<InternalNodeManager> node_mgr_;
99 
102  void InsertImpl(const void* input_keys,
103  const void* input_values,
104  addr_t* output_addrs,
105  bool* output_masks,
106  int64_t count);
107 
108  void Allocate(int64_t bucket_count, int64_t capacity);
109  void Free();
110 };
111 
112 template <typename Hash, typename KeyEq>
114  int64_t init_capacity,
115  int64_t dsize_key,
116  int64_t dsize_value,
117  const Device& device)
118  : DeviceHashmap<Hash, KeyEq>(
119  init_buckets, init_capacity, dsize_key, dsize_value, device) {
120  Allocate(init_buckets, init_capacity);
121 }
122 
123 template <typename Hash, typename KeyEq>
125  Free();
126 }
127 
128 template <typename Hash, typename KeyEq>
129 void CUDAHashmap<Hash, KeyEq>::Rehash(int64_t buckets) {
130  int64_t iterator_count = Size();
131 
132  Tensor active_keys;
133  Tensor active_values;
134 
135  if (iterator_count > 0) {
136  Tensor active_addrs =
137  Tensor({iterator_count}, Dtype::Int32, this->device_);
138  GetActiveIndices(static_cast<addr_t*>(active_addrs.GetDataPtr()));
139 
140  Tensor active_indices = active_addrs.To(Dtype::Int64);
141  active_keys = this->buffer_->GetKeyBuffer().IndexGet({active_indices});
142  active_values =
143  this->buffer_->GetValueBuffer().IndexGet({active_indices});
144  }
145 
146  float avg_capacity_per_bucket =
147  float(this->capacity_) / float(this->bucket_count_);
148 
149  Free();
151 
152  Allocate(buckets, int64_t(std::ceil(buckets * avg_capacity_per_bucket)));
153 
154  if (iterator_count > 0) {
155  Tensor output_addrs({iterator_count}, Dtype::Int32, this->device_);
156  Tensor output_masks({iterator_count}, Dtype::Bool, this->device_);
157 
158  InsertImpl(active_keys.GetDataPtr(), active_values.GetDataPtr(),
159  static_cast<addr_t*>(output_addrs.GetDataPtr()),
160  static_cast<bool*>(output_masks.GetDataPtr()),
161  iterator_count);
162  }
164 }
165 
166 template <typename Hash, typename KeyEq>
167 void CUDAHashmap<Hash, KeyEq>::Insert(const void* input_keys,
168  const void* input_values,
169  addr_t* output_addrs,
170  bool* output_masks,
171  int64_t count) {
172  int64_t new_size = Size() + count;
173  if (new_size > this->capacity_) {
174  float avg_capacity_per_bucket =
175  float(this->capacity_) / float(this->bucket_count_);
176  int64_t expected_buckets = std::max(
177  int64_t(this->bucket_count_ * 2),
178  int64_t(std::ceil(new_size / avg_capacity_per_bucket)));
179  Rehash(expected_buckets);
180  }
181 
182  InsertImpl(input_keys, input_values, output_addrs, output_masks, count);
183 }
184 
185 template <typename Hash, typename KeyEq>
186 void CUDAHashmap<Hash, KeyEq>::Activate(const void* input_keys,
187  addr_t* output_addrs,
188  bool* output_masks,
189  int64_t count) {
190  int64_t new_size = Size() + count;
191  if (new_size > this->capacity_) {
192  float avg_capacity_per_bucket =
193  float(this->capacity_) / float(this->bucket_count_);
194  int64_t expected_buckets = std::max(
195  int64_t(this->bucket_count_ * 2),
196  int64_t(std::ceil(new_size / avg_capacity_per_bucket)));
197  Rehash(expected_buckets);
198  }
199 
200  InsertImpl(input_keys, nullptr, output_addrs, output_masks, count);
201 }
202 
203 template <typename Hash, typename KeyEq>
204 void CUDAHashmap<Hash, KeyEq>::Find(const void* input_keys,
205  addr_t* output_addrs,
206  bool* output_masks,
207  int64_t count) {
208  if (count == 0) return;
209 
210  OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
211 
212  const int64_t num_blocks =
213  (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
214  FindKernel<<<num_blocks, kThreadsPerBlock>>>(
215  gpu_context_, input_keys, output_addrs, output_masks, count);
216  OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
217  OPEN3D_CUDA_CHECK(cudaGetLastError());
218 }
219 
220 template <typename Hash, typename KeyEq>
221 void CUDAHashmap<Hash, KeyEq>::Erase(const void* input_keys,
222  bool* output_masks,
223  int64_t count) {
224  if (count == 0) return;
225 
226  OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
227  auto iterator_addrs = static_cast<addr_t*>(
228  MemoryManager::Malloc(sizeof(addr_t) * count, this->device_));
229 
230  const int64_t num_blocks =
231  (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
232  EraseKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
233  gpu_context_, input_keys, iterator_addrs, output_masks, count);
234  EraseKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
235  gpu_context_, iterator_addrs, output_masks, count);
236  OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
237  OPEN3D_CUDA_CHECK(cudaGetLastError());
238 
239  MemoryManager::Free(iterator_addrs, this->device_);
240 }
241 
242 template <typename Hash, typename KeyEq>
244  uint32_t* iterator_count = static_cast<uint32_t*>(
245  MemoryManager::Malloc(sizeof(uint32_t), this->device_));
246  cudaMemset(iterator_count, 0, sizeof(uint32_t));
247 
248  const int64_t num_blocks =
249  (gpu_context_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
250  kThreadsPerBlock;
251  GetActiveIndicesKernel<<<num_blocks, kThreadsPerBlock>>>(
252  gpu_context_, output_addrs, iterator_count);
253  OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
254  OPEN3D_CUDA_CHECK(cudaGetLastError());
255 
256  uint32_t ret;
257  MemoryManager::MemcpyToHost(&ret, iterator_count, this->device_,
258  sizeof(uint32_t));
259  MemoryManager::Free(iterator_count, this->device_);
260 
261  return static_cast<int64_t>(ret);
262 }
263 
264 template <typename Hash, typename KeyEq>
266  return buffer_ctx_.HeapCounter(this->device_);
267 }
268 
269 template <typename Hash, typename KeyEq>
270 std::vector<int64_t> CUDAHashmap<Hash, KeyEq>::BucketSizes() const {
271  thrust::device_vector<int64_t> elems_per_bucket(gpu_context_.bucket_count_);
272  thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);
273 
274  const int64_t num_blocks =
275  (gpu_context_.capacity_ + kThreadsPerBlock - 1) / kThreadsPerBlock;
276  CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock>>>(
277  gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()));
278  OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
279  OPEN3D_CUDA_CHECK(cudaGetLastError());
280 
281  std::vector<int64_t> result(gpu_context_.bucket_count_);
282  thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
283  result.begin());
284  return std::move(result);
285 }
286 
287 template <typename Hash, typename KeyEq>
289  return float(Size()) / float(this->bucket_count_);
290 }
291 
292 template <typename Hash, typename KeyEq>
293 void CUDAHashmap<Hash, KeyEq>::InsertImpl(const void* input_keys,
294  const void* input_values,
295  addr_t* output_addrs,
296  bool* output_masks,
297  int64_t count) {
298  if (count == 0) return;
299 
302  int prev_heap_counter = buffer_ctx_.HeapCounter(this->device_);
303  *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_) =
304  prev_heap_counter + count;
305 
306  const int64_t num_blocks =
307  (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
308  InsertKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
309  gpu_context_, input_keys, output_addrs, prev_heap_counter, count);
310  InsertKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
311  gpu_context_, input_keys, output_addrs, output_masks, count);
312  InsertKernelPass2<<<num_blocks, kThreadsPerBlock>>>(
313  gpu_context_, input_values, output_addrs, output_masks, count);
314  OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
315  OPEN3D_CUDA_CHECK(cudaGetLastError());
316 }
317 
318 template <typename Hash, typename KeyEq>
319 void CUDAHashmap<Hash, KeyEq>::Allocate(int64_t bucket_count,
320  int64_t capacity) {
321  this->bucket_count_ = bucket_count;
322  this->capacity_ = capacity;
323 
324  // Allocate buffer for key values.
325  this->buffer_ =
326  std::make_shared<HashmapBuffer>(this->capacity_, this->dsize_key_,
327  this->dsize_value_, this->device_);
328  buffer_ctx_.HostAllocate(this->device_);
329  buffer_ctx_.Setup(this->capacity_, this->dsize_key_, this->dsize_value_,
330  this->buffer_->GetKeyBuffer(),
331  this->buffer_->GetValueBuffer(),
332  this->buffer_->GetHeap());
333  buffer_ctx_.Reset(this->device_);
334 
335  // Allocate buffer for linked list nodes.
336  node_mgr_ = std::make_shared<InternalNodeManager>(this->device_);
337 
338  // Allocate linked list heads.
339  gpu_context_.bucket_list_head_ = static_cast<Slab*>(MemoryManager::Malloc(
340  sizeof(Slab) * this->bucket_count_, this->device_));
341  OPEN3D_CUDA_CHECK(cudaMemset(gpu_context_.bucket_list_head_, 0xFF,
342  sizeof(Slab) * this->bucket_count_));
343 
344  gpu_context_.Setup(this->bucket_count_, this->capacity_, this->dsize_key_,
345  this->dsize_value_, node_mgr_->gpu_context_,
346  buffer_ctx_);
347 }
348 
349 template <typename Hash, typename KeyEq>
351  buffer_ctx_.HostFree(this->device_);
352  MemoryManager::Free(gpu_context_.bucket_list_head_, this->device_);
353 }
354 } // namespace core
355 } // namespace open3d
~CUDAHashmap()
Definition: HashmapCUDA.h:124
int64_t Size() const override
Definition: HashmapCUDA.h:265
void ReleaseCache()
Definition: CUDAUtils.cpp:55
void Free()
Definition: HashmapCUDA.h:350
int64_t dsize_value_
Definition: DeviceHashmap.h:175
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
void Allocate(int64_t bucket_count, int64_t capacity)
Definition: HashmapCUDA.h:319
void * GetDataPtr()
Definition: Tensor.h:961
static void Free(void *ptr, const Device &device)
Definition: MemoryManager.cpp:44
void Find(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel find a contiguous array of keys.
Definition: HashmapCUDA.h:204
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:57
std::shared_ptr< InternalNodeManager > node_mgr_
Definition: HashmapCUDA.h:98
static void MemcpyToHost(void *host_ptr, const void *src_ptr, const Device &src_device, size_t num_bytes)
Same as Memcpy, but with host (CPU:0) as default dst_device.
Definition: MemoryManager.cpp:88
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:40
CUDAHashmap(int64_t init_buckets, int64_t init_capacity, int64_t dsize_key, int64_t dsize_value, const Device &device)
Definition: HashmapCUDA.h:113
Definition: HashmapCUDAImpl.h:37
FN_SPECIFIERS MiniVec< float, N > ceil(const MiniVec< float, N > &a)
Definition: MiniVec.h:108
Base class: shared interface.
Definition: DeviceHashmap.h:101
int64_t capacity_
Definition: DeviceHashmap.h:173
static const Dtype Int32
Definition: Dtype.h:44
void Activate(const void *input_keys, addr_t *output_addrs, bool *output_masks, int64_t count) override
Definition: HashmapCUDA.h:186
void Rehash(int64_t buckets) override
Definition: HashmapCUDA.h:129
std::shared_ptr< HashmapBuffer > buffer_
Definition: DeviceHashmap.h:179
Tensor To(Dtype dtype, bool copy=false) const
Definition: Tensor.cpp:453
Definition: Device.h:39
void Insert(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count) override
Parallel insert contiguous arrays of keys and values.
Definition: HashmapCUDA.h:167
CUDAHashmapBufferContext buffer_ctx_
Definition: HashmapCUDA.h:97
int64_t dsize_key_
Definition: DeviceHashmap.h:174
Definition: HashmapCUDA.h:53
int count
Definition: FilePCD.cpp:61
void InsertImpl(const void *input_keys, const void *input_values, addr_t *output_addrs, bool *output_masks, int64_t count)
Definition: HashmapCUDA.h:293
Device device_
Definition: DeviceHashmap.h:177
static const Dtype Int64
Definition: Dtype.h:45
Definition: PinholeCameraIntrinsic.cpp:35
Definition: Tensor.h:48
void Erase(const void *input_keys, bool *output_masks, int64_t count) override
Parallel erase a contiguous array of keys.
Definition: HashmapCUDA.h:221
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle float
Definition: K4aPlugin.cpp:465
uint32_t addr_t
Definition: HashmapBuffer.h:58
float LoadFactor() const override
Return size / bucket_count.
Definition: HashmapCUDA.h:288
std::vector< int64_t > BucketSizes() const override
Definition: HashmapCUDA.h:270
int64_t GetActiveIndices(addr_t *output_indices) override
Parallel collect all iterators in the hash table.
Definition: HashmapCUDA.h:243
Definition: InternalNodeManager.h:58
CUDAHashmapImplContext< Hash, KeyEq > gpu_context_
Definition: HashmapCUDA.h:95
static const Dtype Bool
Definition: Dtype.h:48
Common CUDA utilities.
int64_t bucket_count_
Definition: DeviceHashmap.h:172