24 #include <thrust/device_vector.h>
54 memory_block_index_(0),
55 super_block_index_(0) {}
59 return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
63 return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
70 createMemBlockIndex(tid >> 5);
73 memory_block_bitmap_ =
75 memory_block_index_ * kSlabsPerBlock + lane_id];
88 uint32_t read_bitmap = memory_block_bitmap_;
89 uint32_t allocated_result = kNotFoundFlag;
94 while (allocated_result == kNotFoundFlag) {
95 empty_lane = __ffs(~memory_block_bitmap_) - 1;
96 free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
99 updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
101 read_bitmap = memory_block_bitmap_;
104 uint32_t src_lane = __ffs(free_lane) - 1;
105 if (src_lane == lane_id) {
106 read_bitmap = atomicCAS(
108 super_block_index_ * kUIntsPerSuperBlock +
109 memory_block_index_ * kSlabsPerBlock + lane_id,
110 memory_block_bitmap_,
111 memory_block_bitmap_ | (1 << empty_lane));
112 if (read_bitmap == memory_block_bitmap_) {
114 memory_block_bitmap_ |= (1 << empty_lane);
116 (super_block_index_ << kSuperBlockMaskBits) |
117 (memory_block_index_ << kBlockMaskBits) |
118 (lane_id << kSlabMaskBits) | empty_lane;
121 memory_block_bitmap_ = read_bitmap;
126 __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
128 return allocated_result;
136 getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
137 getMemBlockIndex(ptr) * kSlabsPerBlock +
138 (getMemUnitIndex(ptr) >> 5),
139 ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
143 __device__ __host__ __forceinline__
uint32_t
145 return address >> kSuperBlockMaskBits;
147 __device__ __host__ __forceinline__
uint32_t
149 return ((address >> kBlockMaskBits) & 0x1FFFF);
153 return (kBitmapsPerSuperBlock +
154 getMemBlockIndex(address) * kUIntsPerBlock);
156 __device__ __host__ __forceinline__
uint32_t
158 return address & 0x3FF;
162 return getMemUnitIndex(address) * kWarpSize;
166 __device__
void createMemBlockIndex(
uint32_t global_warp_id) {
167 super_block_index_ = global_warp_id % kSuperBlocks;
168 memory_block_index_ = (
hash_coef_ * global_warp_id) >>
169 (32 - kBlocksPerSuperBlockInBits);
173 __device__
void updateMemBlockIndex(
uint32_t global_warp_id) {
175 super_block_index_++;
177 (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
178 memory_block_index_ = (
hash_coef_ * (global_warp_id + num_attempts_)) >>
179 (32 - kBlocksPerSuperBlockInBits);
181 memory_block_bitmap_ =
182 *((
super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
183 memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
188 return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
189 getMemBlockAddress(address_ptr_index) +
190 getMemUnitIndex(address_ptr_index) * kWarpSize;
193 __host__ __device__
void print_address(
buf_index_t address_ptr_index) {
194 printf(
"Super block Index: %d, Memory block index: %d, Memory unit "
197 getSuperBlockIndex(address_ptr_index),
198 getMemBlockIndex(address_ptr_index),
199 getMemUnitIndex(address_ptr_index));
228 kUIntsPerSuperBlock * kSuperBlocks *
sizeof(
uint32_t),
238 kUIntsPerSuperBlock * kSuperBlocks *
sizeof(
uint32_t)));
240 for (
uint32_t i = 0; i < kSuperBlocks; i++) {
244 kBlocksPerSuperBlock * kSlabsPerBlock *
sizeof(
uint32_t)));
251 const uint32_t num_super_blocks = kSuperBlocks;
253 thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
254 thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
258 int num_mem_units = kBlocksPerSuperBlock * 32;
259 int num_cuda_blocks =
260 (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
262 core::cuda::GetStream()>>>(
263 impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
267 std::vector<int>
result(num_super_blocks);
268 thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:47
core::Tensor result
Definition: VtkUtils.cpp:75
bool copy
Definition: VtkUtils.cpp:73
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:22
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
Definition: MemoryManager.cpp:28
Definition: SlabNodeManager.h:39
buf_index_t kv_pair_ptrs[kWarpSize - 1]
Definition: SlabNodeManager.h:43
buf_index_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition: SlabNodeManager.h:45
Definition: SlabNodeManager.h:219
SlabNodeManager(const Device &device)
Definition: SlabNodeManager.h:221
void Reset()
Definition: SlabNodeManager.h:235
~SlabNodeManager()
Definition: SlabNodeManager.h:233
SlabNodeManagerImpl impl_
Definition: SlabNodeManager.h:275
Device device_
Definition: SlabNodeManager.h:276
std::vector< int > CountSlabsPerSuperblock()
Definition: SlabNodeManager.h:250
Definition: SlabNodeManager.h:48
uint32_t * super_blocks_
A pointer to each super-block.
Definition: SlabNodeManager.h:204
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition: SlabNodeManager.h:61
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const buf_index_t &next_slab_ptr, const uint32_t &lane_id)
Definition: SlabNodeManager.h:57
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition: SlabNodeManager.h:68
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition: SlabNodeManager.h:206
__device__ void FreeUntouched(buf_index_t ptr)
Definition: SlabNodeManager.h:134
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition: SlabNodeManager.h:78
SlabNodeManagerImpl()
Definition: SlabNodeManager.h:50
void Synchronize()
Definition: CUDAUtils.cpp:58
uint32_t buf_index_t
Definition: HashBackendBuffer.h:44
__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl, uint32_t *slabs_per_superblock)
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:548
uint32_t RandUint32()
Definition: Random.cpp:59
Definition: PinholeCameraIntrinsic.cpp:16