#include "buffer_manager.h"
|
|
#include "config.h"
|
|
#include "runtime.h"
|
|
#include "threading.h"
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
|
|
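
/* Overview:
 * The buffer manager carves a fixed memory budget into a small set of regions,
 * one per supported block size. Each region tracks its blocks with a bitmap
 * (one bit per block, 1 = in use) and a per-block reference count. A mutex
 * guards every bitmap/refcount update, so allocation, release and refcount
 * changes may be called from multiple threads. */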

typedef struct rt_buffer_region_s {
    void *memory;
    int16_t *refcounts; // One per block
    uint32_t *bitmap;
    size_t block_count;
    rt_mutex *guard;
} rt_buffer_region;

/* Count leading zeroes.
 * Note that the return value of __builtin_clz(0) is undefined. */
#ifdef _MSC_VER

#include <intrin.h>

#define lzcnt32(x) __lzcnt((x))
#define popcnt32(x) __popcnt((x))

static __forceinline uint32_t tzcnt32(uint32_t x) {
    unsigned long i;
    _BitScanForward(&i, x);
    return (uint32_t)i;
}

static inline bool IsLZCNTSupported(void) {
#define Type 0x80000001
    int info[4];
    /* CPUID leaf 0x80000001: ECX bit 5 reports LZCNT (ABM) support. */
    __cpuid(info, Type);
    return (info[2] & (1 << 5)) != 0;
#undef Type
}

#elif defined(__GNUC__)

#define lzcnt32(x) __builtin_clz((x))
#define tzcnt32(x) __builtin_ctz((x))
#define popcnt32(x) __builtin_popcount((x))

#define IsLZCNTSupported() true

#endif

/* NOTE(Kevin): Keep these sorted! */
static size_t _block_sizes[] = {RT_KB(512), RT_MB(1), RT_MB(4), RT_MB(8)};
#define NUM_BLOCK_SIZES (sizeof(_block_sizes) / sizeof(_block_sizes[0]))

static rt_buffer_region _regions[NUM_BLOCK_SIZES];
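
/* Total memory budget for the buffer manager; InitBufferManager splits it
 * evenly across the NUM_BLOCK_SIZES regions declared above. */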
RT_CVAR_SZ(rt_BufferManagerMemory,
           "Total number of bytes allocated for the buffer manager. Default: 1GB",
           RT_GB(1));
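
/* With the default budget of 1 GB and the four block sizes above (assuming the
 * RT_KB/RT_MB/RT_GB macros are the usual power-of-two byte multiples), every
 * region receives 256 MB: 512 blocks of 512 KiB, 256 of 1 MiB, 64 of 4 MiB and
 * 32 of 8 MiB. */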
rt_result InitBufferManager(void) {
    if ((rt_BufferManagerMemory.sz % NUM_BLOCK_SIZES) != 0)
        rtLog("BUFFERMGR",
              "Configured memory amount is not divisible by the number of block "
              "sizes: %zu MB/%zu",
              rt_BufferManagerMemory.sz / (1024 * 1024),
              NUM_BLOCK_SIZES);

    size_t mem_per_size = rt_BufferManagerMemory.sz / NUM_BLOCK_SIZES;
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        if ((mem_per_size % _block_sizes[i]) != 0)
            rtLog("BUFFERMGR",
                  "Memory per block size is not divisible by the block size: %zu "
                  "MB/%zu KB",
                  mem_per_size / (1024 * 1024),
                  _block_sizes[i] / 1024);

        size_t block_count = mem_per_size / _block_sizes[i];
        _regions[i].block_count = block_count;
        _regions[i].guard = rtCreateMutex();
        if (!_regions[i].guard) {
            rtReportError("BUFFERMGR", "Failed to create guard mutex %u", i);
            return RT_BUFFER_MGR_MUTEX_CREATION_FAILED;
        }
        _regions[i].memory = malloc(mem_per_size);
        if (!_regions[i].memory) {
            rtDestroyMutex(_regions[i].guard);
            rtReportError("BUFFERMGR", "Failed to allocate memory for region %u.", i);
            return RT_BUFFER_MGR_OUT_OF_MEMORY;
        }
        _regions[i].bitmap = calloc((block_count + 31) / 32, sizeof(uint32_t));
        if (!_regions[i].bitmap) {
            rtDestroyMutex(_regions[i].guard);
            free(_regions[i].memory);
            rtReportError("BUFFERMGR", "Failed to allocate memory for region %u.", i);
            return RT_BUFFER_MGR_OUT_OF_MEMORY;
        }
        /* Mark any trailing bits of the last bitmap dword as "in use" so the
         * allocator never hands out blocks past the end of the region. */
        if ((block_count % 32) != 0)
            _regions[i].bitmap[block_count / 32] = ~((1u << (block_count % 32)) - 1);
        _regions[i].refcounts = calloc(block_count, sizeof(int16_t));
        if (!_regions[i].refcounts) {
            rtDestroyMutex(_regions[i].guard);
            free(_regions[i].memory);
            free(_regions[i].bitmap);
            rtReportError("BUFFERMGR", "Failed to allocate memory for region %u.", i);
            return RT_BUFFER_MGR_OUT_OF_MEMORY;
        }
    }
    return RT_SUCCESS;
}

void ShutdownBufferManager(void) {
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        rtDestroyMutex(_regions[i].guard);
        free(_regions[i].memory);
        free(_regions[i].bitmap);
        free(_regions[i].refcounts);
    }
}
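
/* rtAllocBuffer picks the block size that covers the request with the fewest
 * blocks (a larger size is only chosen if the request is at least one block of
 * that size) and then searches that region's bitmap for a run of free blocks.
 *
 * Usage sketch (signatures as declared in buffer_manager.h; error handling
 * elided):
 *
 *     void *staging = rtAllocBuffer(RT_MB(2)); // served as two 1 MB blocks
 *     if (staging) {
 *         // ... fill the buffer ...
 *         rtReleaseBuffer(staging, RT_MB(2));  // size must match the request
 *     }
 */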
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
    assert(IsLZCNTSupported());

    // Determine the best block size to use
    size_t required_blocks = (size + _block_sizes[0] - 1) / _block_sizes[0];
    size_t best_fit = 0;
    for (size_t i = 1; i < NUM_BLOCK_SIZES; ++i) {
        size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
        if (block_count < required_blocks && size >= _block_sizes[i]) {
            required_blocks = block_count;
            best_fit = i;
        }
    }

    void *result = NULL;

    rt_buffer_region *region = &_regions[best_fit];
    rtLockMutex(region->guard);
    size_t dword_count = (region->block_count + 31) / 32;
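
    /* Fast path idea: build a mask with required_blocks consecutive 1-bits and
     * slide it over each 32-bit bitmap word. lzcnt/tzcnt give the size of the
     * free run at the top/bottom of a word, and a final check combines the top
     * of one word with the bottom of the next for runs that straddle a word
     * boundary. */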
    if (required_blocks < 32) {
        /* Fast path for allocations that potentially fit into one dword */
        uint32_t in_use_mask = (1ull << required_blocks) - 1;
        size_t max_occupancy = 32 - required_blocks;
        for (size_t i = 0; i < dword_count; ++i) {
            size_t block_index = 0;
            if (region->bitmap[i] != 0 && popcnt32(region->bitmap[i]) < max_occupancy) {
                size_t free_high_blocks = lzcnt32(region->bitmap[i]);
                if (free_high_blocks >= required_blocks) {
                    /* High blocks are free */
                    size_t first_free = 32 - free_high_blocks;
                    region->bitmap[i] |= (in_use_mask << first_free);
                    block_index = i * 32 + first_free;
                    result = (char *)region->memory + block_index * _block_sizes[best_fit];
                } else if (tzcnt32(region->bitmap[i]) >= required_blocks) {
                    /* Low blocks are free */
                    region->bitmap[i] |= in_use_mask;
                    block_index = i * 32;
                    result = (char *)region->memory + block_index * _block_sizes[best_fit];
                } else {
                    /* Check if we can find a large enough range of free blocks.
                     * Start after the first set bit.
                     */
                    for (uint32_t j = tzcnt32(region->bitmap[i]) + 1; j < 32 - required_blocks;
                         ++j) {
                        if ((region->bitmap[i] & (in_use_mask << j)) == 0) {
                            region->bitmap[i] |= (in_use_mask << j);
                            block_index = i * 32 + j;
                            result = (char *)region->memory + block_index * _block_sizes[best_fit];
                            break;
                        }
                    }
                }
            } else if (region->bitmap[i] == 0) {
                /* All free */
                region->bitmap[i] = in_use_mask;
                block_index = i * 32;
                result = (char *)region->memory + block_index * _block_sizes[best_fit];
            } else if (i < dword_count - 1) {
                /* Check if we can use high blocks from this dword and low blocks from the next one
                 */
                size_t high_blocks = lzcnt32(region->bitmap[i]);
                size_t low_blocks =
                    (region->bitmap[i + 1] != 0) ? tzcnt32(region->bitmap[i + 1]) : 32;

                if (high_blocks + low_blocks >= required_blocks) {
                    size_t high_mask = (1u << high_blocks) - 1;
                    size_t first_free = 32 - high_blocks;
                    size_t low_mask = (1u << (required_blocks - high_blocks)) - 1;

                    region->bitmap[i] |= (high_mask << first_free);
                    region->bitmap[i + 1] |= low_mask;
                    block_index = i * 32 + first_free;
                    result = (char *)region->memory + block_index * _block_sizes[best_fit];
                }
            }

            if (result) {
                for (size_t j = 0; j < required_blocks; ++j)
                    region->refcounts[block_index + j] = 1;
                break;
            }
        }
    } else {
        for (size_t i = 0; i < dword_count; ++i) {
            if (region->bitmap[i] == UINT32_MAX) {
                continue;
            }
            /* Check if we can start the allocation here.
             * TODO: Allocations of 32 or more blocks are not implemented yet, so
             * result stays NULL and the caller gets nothing. */
        }
    }
    rtUnlockMutex(region->guard);
    return result;
}
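
/* rtReleaseBuffer finds the region that contains [begin, begin + size), drops
 * the reference count of every block in that range, and clears a block's
 * bitmap bit once its count reaches zero. */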
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
    if (!begin)
        return;
    uintptr_t begin_addr = (uintptr_t)begin;
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        uintptr_t region_addr = (uintptr_t)_regions[i].memory;
        size_t region_size = _block_sizes[i] * _regions[i].block_count;
        if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {

            size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
            size_t first_block = (begin_addr - region_addr) / _block_sizes[i];

            rtLockMutex(_regions[i].guard);
            for (size_t j = 0; j < block_count; ++j) {
                size_t dword = (first_block + j) / 32;
                size_t bit = (first_block + j) % 32;

                if (--_regions[i].refcounts[first_block + j] == 0)
                    _regions[i].bitmap[dword] &= ~(1u << bit);
            }
            rtUnlockMutex(_regions[i].guard);
            return;
        }
    }
    rtLog("BUFFERMGR", "Tried to release an invalid buffer");
}
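
/* rtIncreaseBufferRefCount bumps the reference count of every block covered by
 * [begin, begin + size). Each rtReleaseBuffer call undoes one such increment. */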
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
    uintptr_t begin_addr = (uintptr_t)begin;
    for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
        uintptr_t region_addr = (uintptr_t)_regions[i].memory;
        size_t region_size = _block_sizes[i] * _regions[i].block_count;
        if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {

            size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
            size_t first_block = (begin_addr - region_addr) / _block_sizes[i];

            rtLockMutex(_regions[i].guard);
            for (size_t j = 0; j < block_count; ++j) {
                ++_regions[i].refcounts[first_block + j];
            }
            rtUnlockMutex(_regions[i].guard);
            return;
        }
    }
    rtLog("BUFFERMGR", "Tried to increase the refcount of an invalid buffer");
}