Open3D (C++ API)  0.11.0
HashmapCUDA.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// The MIT License (MIT)
//
// Copyright (c) 2018 www.open3d.org
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ----------------------------------------------------------------------------

// Copyright 2019 Saman Ashkiani
// Rewritten by Wei Dong 2019 - 2020
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing permissions
// and limitations under the License.

#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <memory>

#include <thrust/device_vector.h>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/MemoryManager.h"
#include "open3d/core/hashmap/CUDA/HashmapCUDAImpl.h"
#include "open3d/core/hashmap/DeviceHashmap.h"

namespace open3d {
namespace core {

template <typename Hash, typename KeyEq>
class CUDAHashmap : public DeviceHashmap<Hash, KeyEq> {
public:
    CUDAHashmap(size_t init_buckets,
                size_t init_capacity,
                size_t dsize_key,
                size_t dsize_value,
                const Device& device);

    ~CUDAHashmap();

    void Rehash(size_t buckets) override;

    /// Parallel insert contiguous arrays of keys and values.
    void Insert(const void* input_keys,
                const void* input_values,
                iterator_t* output_iterators,
                bool* output_masks,
                size_t count) override;

    /// Parallel activate: insert keys without writing values.
    void Activate(const void* input_keys,
                  iterator_t* output_iterators,
                  bool* output_masks,
                  size_t count) override;

    /// Parallel find a contiguous array of keys.
    void Find(const void* input_keys,
              iterator_t* output_iterators,
              bool* output_masks,
              size_t count) override;

    /// Parallel erase a contiguous array of keys.
    void Erase(const void* input_keys,
               bool* output_masks,
               size_t count) override;

    /// Parallel collect all iterators in the hash table.
    size_t GetIterators(iterator_t* output_iterators) override;

    /// Parallel unpack iterators to contiguous arrays of keys and/or values.
    void UnpackIterators(const iterator_t* input_iterators,
                         const bool* input_masks,
                         void* output_keys,
                         void* output_values,
                         size_t count) override;

    /// Parallel assign iterators in-place with associated values.
    void AssignIterators(iterator_t* input_iterators,
                         const bool* input_masks,
                         const void* input_values,
                         size_t count) override;

    /// Return the number of elements stored in each bucket.
    std::vector<size_t> BucketSizes() const override;

    /// Return size / bucket_count.
    float LoadFactor() const override;

    size_t Size() const override;

protected:
    /// Device-side context passed to the CUDA kernels.
    CUDAHashmapImplContext<Hash, KeyEq> gpu_context_;

    std::shared_ptr<InternalKvPairManager> kv_mgr_;
    std::shared_ptr<InternalNodeManager> node_mgr_;

    /// Insert without capacity checks or rehashing; used by Insert, Activate,
    /// and Rehash.
    void InsertImpl(const void* input_keys,
                    const void* input_values,
                    iterator_t* output_iterators,
                    bool* output_masks,
                    size_t count);

    void Allocate(size_t bucket_count, size_t capacity);
};
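
// A minimal usage sketch (not part of the original header), assuming a
// CUDA-enabled build, placeholder Hash/KeyEq functors `hash_t`/`eq_t` for
// int keys, and device buffers `d_keys`, `d_values`, `d_iterators`,
// `d_masks` of length `n` already allocated in CUDA memory:
//
// \code
// Device device("CUDA:0");
// CUDAHashmap<hash_t, eq_t> hashmap(/*init_buckets=*/1024,
//                                   /*init_capacity=*/4096,
//                                   /*dsize_key=*/sizeof(int),
//                                   /*dsize_value=*/sizeof(float),
//                                   device);
//
// hashmap.Insert(d_keys, d_values, d_iterators, d_masks, n);
// hashmap.Find(d_keys, d_iterators, d_masks, n);  // d_masks[i]: key i found
// hashmap.Erase(d_keys, d_masks, n);              // d_masks[i]: key i erased
// \endcode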

template <typename Hash, typename KeyEq>
CUDAHashmap<Hash, KeyEq>::CUDAHashmap(size_t init_buckets,
                                      size_t init_capacity,
                                      size_t dsize_key,
                                      size_t dsize_value,
                                      const Device& device)
    : DeviceHashmap<Hash, KeyEq>(
              init_buckets, init_capacity, dsize_key, dsize_value, device) {
    Allocate(init_buckets, init_capacity);
}

template <typename Hash, typename KeyEq>
CUDAHashmap<Hash, KeyEq>::~CUDAHashmap() {
    MemoryManager::Free(gpu_context_.bucket_list_head_, this->device_);
}

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Rehash(size_t buckets) {
    size_t iterator_count = Size();

    void* output_keys = nullptr;
    void* output_values = nullptr;
    iterator_t* output_iterators = nullptr;
    bool* output_masks = nullptr;

    if (iterator_count > 0) {
        output_keys = MemoryManager::Malloc(this->dsize_key_ * iterator_count,
                                            this->device_);
        output_values = MemoryManager::Malloc(
                this->dsize_value_ * iterator_count, this->device_);
        output_iterators = static_cast<iterator_t*>(MemoryManager::Malloc(
                sizeof(iterator_t) * iterator_count, this->device_));
        output_masks = static_cast<bool*>(MemoryManager::Malloc(
                sizeof(bool) * iterator_count, this->device_));

        GetIterators(output_iterators);
        UnpackIterators(output_iterators, /* masks = */ nullptr, output_keys,
                        output_values, iterator_count);
    }

    float avg_capacity_per_bucket =
            float(this->capacity_) / float(this->bucket_count_);
    MemoryManager::Free(gpu_context_.bucket_list_head_, this->device_);
    Allocate(buckets, size_t(std::ceil(buckets * avg_capacity_per_bucket)));

    if (iterator_count > 0) {
        InsertImpl(output_keys, output_values, output_iterators, output_masks,
                   iterator_count);

        MemoryManager::Free(output_keys, this->device_);
        MemoryManager::Free(output_values, this->device_);
        MemoryManager::Free(output_masks, this->device_);
        MemoryManager::Free(output_iterators, this->device_);
    }
}

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Insert(const void* input_keys,
                                      const void* input_values,
                                      iterator_t* output_iterators,
                                      bool* output_masks,
                                      size_t count) {
    size_t new_size = Size() + count;
    if (new_size > this->capacity_) {
        float avg_capacity_per_bucket =
                float(this->capacity_) / float(this->bucket_count_);
        size_t expected_buckets = std::max(
                this->bucket_count_ * 2,
                size_t(std::ceil(new_size / avg_capacity_per_bucket)));
        Rehash(expected_buckets);
    }

    InsertImpl(input_keys, input_values, output_iterators, output_masks,
               count);
}
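
// Growth-policy illustration (numbers chosen for this example only): with
// bucket_count_ = 1000 and capacity_ = 4000, the average capacity per bucket
// is 4. If an Insert would raise the size to new_size = 10000, the table
// requests max(2 * 1000, ceil(10000 / 4)) = 2500 buckets, and Rehash() then
// re-allocates capacity ceil(2500 * 4) = 10000, preserving the original
// buckets-to-capacity ratio.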

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Activate(const void* input_keys,
                                        iterator_t* output_iterators,
                                        bool* output_masks,
                                        size_t count) {
    size_t new_size = Size() + count;
    if (new_size > this->capacity_) {
        float avg_capacity_per_bucket =
                float(this->capacity_) / float(this->bucket_count_);
        size_t expected_buckets = std::max(
                this->bucket_count_ * 2,
                size_t(std::ceil(new_size / avg_capacity_per_bucket)));
        Rehash(expected_buckets);
    }

    InsertImpl(input_keys, nullptr, output_iterators, output_masks, count);
}

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Find(const void* input_keys,
                                    iterator_t* output_iterators,
                                    bool* output_masks,
                                    size_t count) {
    if (count == 0) return;

    OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));

    const size_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    FindKernel<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, output_iterators, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());
}
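
// Post-processing sketch (illustrative, not part of the original header):
// counting how many queries were found by reducing the device-side mask
// array with thrust. `d_masks` and `count` stand for the arguments just
// passed to Find(); requires <thrust/reduce.h> and <thrust/functional.h>.
//
// \code
// size_t num_found = thrust::reduce(
//         thrust::device_ptr<bool>(d_masks),
//         thrust::device_ptr<bool>(d_masks) + count,
//         static_cast<size_t>(0), thrust::plus<size_t>());
// \endcode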

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Erase(const void* input_keys,
                                     bool* output_masks,
                                     size_t count) {
    if (count == 0) return;

    OPEN3D_CUDA_CHECK(cudaMemset(output_masks, 0, sizeof(bool) * count));
    auto iterator_addrs = static_cast<addr_t*>(
            MemoryManager::Malloc(sizeof(addr_t) * count, this->device_));

    const size_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    EraseKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, iterator_addrs, output_masks, count);
    EraseKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, iterator_addrs, output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    MemoryManager::Free(iterator_addrs, this->device_);
}

template <typename Hash, typename KeyEq>
size_t CUDAHashmap<Hash, KeyEq>::GetIterators(iterator_t* output_iterators) {
    uint32_t* iterator_count = static_cast<uint32_t*>(
            MemoryManager::Malloc(sizeof(uint32_t), this->device_));
    cudaMemset(iterator_count, 0, sizeof(uint32_t));

    const size_t num_blocks =
            (gpu_context_.bucket_count_ * kWarpSize + kThreadsPerBlock - 1) /
            kThreadsPerBlock;
    GetIteratorsKernel<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, output_iterators, iterator_count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    uint32_t ret;
    MemoryManager::MemcpyToHost(&ret, iterator_count, this->device_,
                                sizeof(uint32_t));
    MemoryManager::Free(iterator_count, this->device_);

    return static_cast<size_t>(ret);
}
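
// Export sketch (illustrative, not part of the original header): dumping all
// active entries by pairing GetIterators() with UnpackIterators(), then
// copying the packed keys back to the host. `hashmap`, `device`, `dsize_key`,
// and `dsize_value` are assumed to be in scope.
//
// \code
// size_t n = hashmap.Size();
// auto it = static_cast<iterator_t*>(
//         MemoryManager::Malloc(sizeof(iterator_t) * n, device));
// void* keys = MemoryManager::Malloc(dsize_key * n, device);
// void* vals = MemoryManager::Malloc(dsize_value * n, device);
//
// size_t n_valid = hashmap.GetIterators(it);
// hashmap.UnpackIterators(it, /* masks = */ nullptr, keys, vals, n_valid);
//
// std::vector<char> host_keys(dsize_key * n_valid);
// MemoryManager::MemcpyToHost(host_keys.data(), keys, device,
//                             dsize_key * n_valid);
// // ... same for vals; finally Free it, keys, and vals.
// \endcode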

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::UnpackIterators(
        const iterator_t* input_iterators,
        const bool* input_masks,
        void* output_keys,
        void* output_values,
        size_t iterator_count) {
    if (iterator_count == 0) return;

    const size_t num_blocks =
            (iterator_count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    UnpackIteratorsKernel<<<num_blocks, kThreadsPerBlock>>>(
            input_iterators, input_masks, output_keys, output_values,
            this->dsize_key_, this->dsize_value_, iterator_count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());
}

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::AssignIterators(iterator_t* input_iterators,
                                               const bool* input_masks,
                                               const void* input_values,
                                               size_t iterator_count) {
    if (iterator_count == 0) return;

    const size_t num_blocks =
            (iterator_count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    AssignIteratorsKernel<<<num_blocks, kThreadsPerBlock>>>(
            input_iterators, input_masks, input_values, this->dsize_value_,
            iterator_count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());
}

template <typename Hash, typename KeyEq>
std::vector<size_t> CUDAHashmap<Hash, KeyEq>::BucketSizes() const {
    thrust::device_vector<size_t> elems_per_bucket(gpu_context_.bucket_count_);
    thrust::fill(elems_per_bucket.begin(), elems_per_bucket.end(), 0);

    const size_t num_blocks =
            (gpu_context_.capacity_ + kThreadsPerBlock - 1) / kThreadsPerBlock;
    CountElemsPerBucketKernel<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, thrust::raw_pointer_cast(elems_per_bucket.data()));
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    std::vector<size_t> result(gpu_context_.bucket_count_);
    thrust::copy(elems_per_bucket.begin(), elems_per_bucket.end(),
                 result.begin());
    return result;
}
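
// Diagnostics sketch (illustrative, not part of the original header):
// BucketSizes() returns one element count per bucket, so simple occupancy
// statistics can be computed on the host; requires <numeric> and <algorithm>.
//
// \code
// std::vector<size_t> sizes = hashmap.BucketSizes();
// size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t(0));
// size_t longest = *std::max_element(sizes.begin(), sizes.end());
// // `total` is the number of stored entries; `longest` hints at the
// // worst-case chain length a query may have to traverse.
// \endcode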

template <typename Hash, typename KeyEq>
float CUDAHashmap<Hash, KeyEq>::LoadFactor() const {
    return float(Size()) / float(this->bucket_count_);
}

template <typename Hash, typename KeyEq>
size_t CUDAHashmap<Hash, KeyEq>::Size() const {
    return *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_);
}

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::InsertImpl(const void* input_keys,
                                          const void* input_values,
                                          iterator_t* output_iterators,
                                          bool* output_masks,
                                          size_t count) {
    if (count == 0) return;
    auto iterator_addrs = static_cast<addr_t*>(
            MemoryManager::Malloc(sizeof(addr_t) * count, this->device_));

    // Reserve `count` key-value slots up front by bumping the heap counter
    // once from the host, so the kernels do not need per-element atomic
    // updates on the global counter.
    int prev_heap_counter =
            *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_);
    *thrust::device_ptr<int>(gpu_context_.kv_mgr_ctx_.heap_counter_) =
            prev_heap_counter + count;

    const size_t num_blocks = (count + kThreadsPerBlock - 1) / kThreadsPerBlock;
    // Pass 0: assign a pre-allocated key-value buffer address to each input.
    InsertKernelPass0<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, iterator_addrs, prev_heap_counter, count);
    // Pass 1: insert the keys into the slab lists; set masks on success.
    InsertKernelPass1<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_keys, iterator_addrs, output_masks, count);
    // Pass 2: write values and output iterators for the inserted entries.
    InsertKernelPass2<<<num_blocks, kThreadsPerBlock>>>(
            gpu_context_, input_values, iterator_addrs, output_iterators,
            output_masks, count);
    OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
    OPEN3D_CUDA_CHECK(cudaGetLastError());

    MemoryManager::Free(iterator_addrs, this->device_);
}

template <typename Hash, typename KeyEq>
void CUDAHashmap<Hash, KeyEq>::Allocate(size_t bucket_count, size_t capacity) {
    this->bucket_count_ = bucket_count;
    this->capacity_ = capacity;

    // Allocate buffer for key-values.
    kv_mgr_ = std::make_shared<InternalKvPairManager>(
            this->capacity_, this->dsize_key_, this->dsize_value_,
            this->device_);

    // Allocate buffer for linked list nodes.
    node_mgr_ = std::make_shared<InternalNodeManager>(this->device_);

    // Allocate linked list heads, filled with 0xFF so that every bucket
    // starts out as an empty slab.
    gpu_context_.bucket_list_head_ = static_cast<Slab*>(MemoryManager::Malloc(
            sizeof(Slab) * this->bucket_count_, this->device_));
    OPEN3D_CUDA_CHECK(cudaMemset(gpu_context_.bucket_list_head_, 0xFF,
                                 sizeof(Slab) * this->bucket_count_));

    gpu_context_.Setup(this->bucket_count_, this->capacity_, this->dsize_key_,
                       this->dsize_value_, node_mgr_->gpu_context_,
                       kv_mgr_->gpu_context_);
}

}  // namespace core
}  // namespace open3d