Open3D (C++ API)  0.12.0
InternalNodeManager.h
Go to the documentation of this file.
1 // ----------------------------------------------------------------------------
2 // - Open3D: www.open3d.org -
3 // ----------------------------------------------------------------------------
4 // The MIT License (MIT)
5 //
6 // Copyright (c) 2018 www.open3d.org
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 // IN THE SOFTWARE.
25 // ----------------------------------------------------------------------------
26 
27 // Copyright 2019 Saman Ashkiani
28 // Rewritten by Wei Dong 2019 - 2020
29 // Licensed under the Apache License, Version 2.0 (the "License");
30 // you may not use this file except in compliance with the License.
31 // You may obtain a copy of the License at
32 //
33 // http://www.apache.org/licenses/LICENSE-2.0
34 //
35 // Unless required by applicable law or agreed to in writing, software
36 // distributed under the License is distributed on an "AS IS" BASIS,
37 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
38 // implied. See the License for the specific language governing permissions
39 // and limitations under the License.
40 
41 #pragma once
42 
43 #include <thrust/device_vector.h>
44 
45 #include <cassert>
46 #include <memory>
47 #include <random>
48 
49 #include "open3d/core/CUDAUtils.h"
52 
53 namespace open3d {
54 namespace core {
55 
// A slab: one warp-width storage unit of the slab hash table.
class Slab {
public:
    // One key-value pair pointer per warp lane except the last.
    // NOTE(review): the Doxygen index lists an additional member
    // `addr_t next_slab_ptr` ("an internal ptr managed by
    // InternalNodeManager") that is not visible in this view — confirm
    // against the full header before relying on sizeof(Slab).
    addr_t kv_pair_ptrs[kWarpSize - 1];
};
66 
68 public:
70  : super_blocks_(nullptr),
71  hash_coef_(0),
72  num_attempts_(0),
73  memory_block_index_(0),
74  super_block_index_(0) {}
75 
76  __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab(
77  const addr_t& next_slab_ptr, const uint32_t& lane_id) {
78  return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
79  }
80  __device__ __forceinline__ uint32_t* get_ptr_for_bitmap(
81  const uint32_t super_block_idx, const uint32_t bitmap_idx) {
82  return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
83  bitmap_idx;
84  }
85 
86  // Objective: each warp selects its own memory_block warp allocator.
87  __device__ void Init(uint32_t& tid, uint32_t& lane_id) {
88  // Hashing the memory block to be used.
89  createMemBlockIndex(tid >> 5);
90 
91  // Loading the assigned memory block.
92  memory_block_bitmap_ =
93  super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
94  memory_block_index_ * kSlabsPerBlock + lane_id];
95  }
96 
    // Allocates one memory unit (slab) for the calling warp and returns its
    // packed address.
    //
    // Each lane owns one 32-bit word of the selected memory block's bitmap;
    // a set bit marks an allocated unit. Lanes race via atomicCAS to claim a
    // free bit; if every lane's word is full the warp rehashes to another
    // memory block (updateMemBlockIndex) and retries.
    //
    // Packed result layout (works as long as <31 bits are used, i.e. at most
    // 32 super blocks and 64k blocks per super block):
    //   5 bits:  super_block_index
    //   17 bits: memory block index
    //   5 bits:  memory unit index (hi-bits of 10bit) == claiming lane id
    //   5 bits:  memory unit index (lo-bits of 10bit) == bit within the word
    __device__ uint32_t WarpAllocate(const uint32_t& lane_id) {
        int empty_lane = -1;
        uint32_t free_lane;
        uint32_t read_bitmap = memory_block_bitmap_;
        uint32_t allocated_result = kNotFoundFlag;

        while (allocated_result == kNotFoundFlag) {
            // Lowest clear bit in this lane's bitmap word; -1 if none.
            empty_lane = __ffs(~memory_block_bitmap_) - 1;
            // Mask of lanes that still see a free unit in their word.
            free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
            if (free_lane == 0) {
                // All bitmaps are full: rehash to another memory block,
                // keyed by the global warp id, and retry.
                updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
                                    5);
                read_bitmap = memory_block_bitmap_;
                continue;
            }
            // The first lane holding a free bit performs the claim.
            uint32_t src_lane = __ffs(free_lane) - 1;
            if (src_lane == lane_id) {
                read_bitmap = atomicCAS(
                        super_blocks_ +
                                super_block_index_ * kUIntsPerSuperBlock +
                                memory_block_index_ * kSlabsPerBlock + lane_id,
                        memory_block_bitmap_,
                        memory_block_bitmap_ | (1 << empty_lane));
                if (read_bitmap == memory_block_bitmap_) {
                    // Successful attempt: commit the bit locally and pack
                    // the allocated address.
                    memory_block_bitmap_ |= (1 << empty_lane);
                    allocated_result =
                            (super_block_index_ << kSuperBlockMaskBits) |
                            (memory_block_index_ << kBlockMaskBits) |
                            (lane_id << kSlabMaskBits) | empty_lane;
                } else {
                    // CAS lost: refresh the cached bitmap and retry.
                    memory_block_bitmap_ = read_bitmap;
                }
            }
            // Broadcast the result from the claiming lane to the whole warp.
            allocated_result =
                    __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
        }
        return allocated_result;
    }
149 
150  // This function, frees a recently allocated memory unit by a single thread.
151  // Since it is untouched, there shouldn't be any worries for the actual
152  // memory contents to be reset again.
153  __device__ void FreeUntouched(addr_t ptr) {
154  atomicAnd(super_blocks_ +
155  getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
156  getMemBlockIndex(ptr) * kSlabsPerBlock +
157  (getMemUnitIndex(ptr) >> 5),
158  ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
159  }
160 
private:
    // Extracts the super block index (5 bits per WarpAllocate's layout)
    // from a packed slab address.
    __device__ __host__ __forceinline__ uint32_t
    getSuperBlockIndex(addr_t address) const {
        return address >> kSuperBlockMaskBits;
    }
    // Extracts the memory block index (17 bits) from a packed slab address.
    __device__ __host__ __forceinline__ uint32_t
    getMemBlockIndex(addr_t address) const {
        return ((address >> kBlockMaskBits) & 0x1FFFF);
    }
    // Offset (in uints) of the address's memory block relative to its super
    // block; skips the bitmap region at the front of the super block.
    __device__ __host__ __forceinline__ addr_t
    getMemBlockAddress(addr_t address) const {
        return (kBitmapsPerSuperBlock +
                getMemBlockIndex(address) * kUIntsPerBlock);
    }
    // Extracts the memory unit (slab) index (10 bits) from a packed address.
    __device__ __host__ __forceinline__ uint32_t
    getMemUnitIndex(addr_t address) const {
        return address & 0x3FF;
    }
    // Offset (in uints) of the memory unit within its memory block; each
    // unit spans kWarpSize uints.
    __device__ __host__ __forceinline__ addr_t
    getMemUnitAddress(addr_t address) {
        return getMemUnitIndex(address) * kWarpSize;
    }
183 
184  // Called at the beginning of the kernel.
    // Called at the beginning of the kernel: hashes the warp to a
    // (super block, memory block) pair — round-robin over super blocks, and
    // a multiplicative hash taking the top kBlocksPerSuperBlockInBits bits
    // for the block index.
    __device__ void createMemBlockIndex(uint32_t global_warp_id) {
        super_block_index_ = global_warp_id % kSuperBlocks;
        memory_block_index_ = (hash_coef_ * global_warp_id) >>
                              (32 - kBlocksPerSuperBlockInBits);
    }
190 
191  // Called when the allocator fails to find an empty unit to allocate.
192  __device__ void updateMemBlockIndex(uint32_t global_warp_id) {
193  num_attempts_++;
194  super_block_index_++;
195  super_block_index_ =
196  (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
197  memory_block_index_ = (hash_coef_ * (global_warp_id + num_attempts_)) >>
198  (32 - kBlocksPerSuperBlockInBits);
199  // Loading the assigned memory block.
200  memory_block_bitmap_ =
201  *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
202  memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
203  }
204 
    // Decodes a packed slab address into a flat uint offset into
    // super_blocks_: super block base + memory block offset + unit offset.
    __host__ __device__ addr_t addressDecoder(addr_t address_ptr_index) {
        return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
               getMemBlockAddress(address_ptr_index) +
               getMemUnitIndex(address_ptr_index) * kWarpSize;
    }
210 
211  __host__ __device__ void print_address(addr_t address_ptr_index) {
212  printf("Super block Index: %d, Memory block index: %d, Memory unit "
213  "index: "
214  "%d\n",
215  getSuperBlockIndex(address_ptr_index),
216  getMemBlockIndex(address_ptr_index),
217  getMemUnitIndex(address_ptr_index));
218  }
219 
public:
    // NOTE(review): the Doxygen index lists `uint32_t* super_blocks_` (a
    // pointer to each super block) here; it is not visible in this view.
    uint32_t hash_coef_;  // A random 32-bit coefficient for block hashing.

private:
    uint32_t num_attempts_;        // Rehash attempts since Init (salt).
    uint32_t memory_block_index_;  // Currently assigned memory block.
    uint32_t memory_block_bitmap_; // This lane's cached bitmap word.
    uint32_t super_block_index_;   // Currently assigned super block.
};
233 
// Kernel: tallies the number of allocated slabs in each super block by
// popcounting the allocator bitmaps (definition below).
__global__ void CountSlabsPerSuperblockKernel(
        InternalNodeManagerContext context, uint32_t* slabs_per_superblock);
236 
238 public:
239  InternalNodeManager(const Device& device) : device_(device) {
241  std::mt19937 rng(time(0));
242  gpu_context_.hash_coef_ = rng();
243 
246  gpu_context_.super_blocks_ =
247  static_cast<uint32_t*>(MemoryManager::Malloc(
248  kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t),
249  device_));
250 
251  OPEN3D_CUDA_CHECK(cudaMemset(
252  gpu_context_.super_blocks_, 0xFF,
253  kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));
254 
255  for (uint32_t i = 0; i < kSuperBlocks; i++) {
256  // setting bitmaps into zeros:
257  OPEN3D_CUDA_CHECK(cudaMemset(
258  gpu_context_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
259  kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
260  }
261  }
262 
264  MemoryManager::Free(gpu_context_.super_blocks_, device_);
265  }
266 
267  std::vector<int> CountSlabsPerSuperblock() {
268  const uint32_t num_super_blocks = kSuperBlocks;
269 
270  thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
271  thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
272  0);
273 
274  // Counting total number of allocated memory units.
275  int num_mem_units = kBlocksPerSuperBlock * 32;
276  int num_cuda_blocks =
277  (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
278  CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock>>>(
279  gpu_context_,
280  thrust::raw_pointer_cast(slabs_per_superblock.data()));
281  OPEN3D_CUDA_CHECK(cudaDeviceSynchronize());
282  OPEN3D_CUDA_CHECK(cudaGetLastError());
283 
284  std::vector<int> result(num_super_blocks);
285  thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
286  result.begin());
287 
288  return std::move(result);
289  }
290 
291 public:
294 };
295 
        InternalNodeManagerContext context, uint32_t* slabs_per_superblock) {
    // (Kernel signature begins outside this view.)
    // One thread per bitmap word: each thread popcounts its word in every
    // super block and atomically accumulates into that super block's counter.
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;

    // 32 bitmap words per memory block (one per warp lane).
    int num_bitmaps = kBlocksPerSuperBlock * 32;
    if (tid >= num_bitmaps) {
        return;
    }

    for (uint32_t i = 0; i < kSuperBlocks; i++) {
        uint32_t read_bitmap = *(context.get_ptr_for_bitmap(i, tid));
        // Each set bit is one allocated slab.
        atomicAdd(&slabs_per_superblock[i], __popc(read_bitmap));
    }
}
310 } // namespace core
311 } // namespace open3d
uint32_t * super_blocks_
A pointer to each super-block.
Definition: InternalNodeManager.h:222
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition: InternalNodeManager.h:97
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:557
static void Free(void *ptr, const Device &device)
Definition: MemoryManager.cpp:44
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:57
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:40
__device__ void FreeUntouched(addr_t ptr)
Definition: InternalNodeManager.h:153
Device device_
Definition: InternalNodeManager.h:293
addr_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition: InternalNodeManager.h:64
std::vector< int > CountSlabsPerSuperblock()
Definition: InternalNodeManager.h:267
InternalNodeManager(const Device &device)
Definition: InternalNodeManager.h:239
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition: InternalNodeManager.h:87
Definition: Device.h:39
Definition: InternalNodeManager.h:67
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition: InternalNodeManager.h:80
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition: InternalNodeManager.h:224
InternalNodeManagerContext gpu_context_
Definition: InternalNodeManager.h:292
Definition: PinholeCameraIntrinsic.cpp:35
uint32_t addr_t
Definition: HashmapBuffer.h:58
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const addr_t &next_slab_ptr, const uint32_t &lane_id)
Definition: InternalNodeManager.h:76
Definition: InternalNodeManager.h:58
Common CUDA utilities.
~InternalNodeManager()
Definition: InternalNodeManager.h:263
addr_t kv_pair_ptrs[kWarpSize - 1]
Definition: InternalNodeManager.h:62
InternalNodeManagerContext()
Definition: InternalNodeManager.h:69
__global__ void CountSlabsPerSuperblockKernel(InternalNodeManagerContext context, uint32_t *slabs_per_superblock)
Definition: InternalNodeManager.h:296
Definition: InternalNodeManager.h:237