/* Listing metadata: 416 lines, 15 KiB, C */
#include "buffer_manager.h"
|
|
#include "config.h"
|
|
#include "runtime.h"
|
|
#include "threading.h"
|
|
|
|
#include <assert.h>
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#if 0
/* Bookkeeping for one pool of equally sized blocks.
 * Part of the older multi-region allocator, which is compiled out. */
typedef struct rt_buffer_region_s {
    void *memory;       /* Start of the region's backing storage */
    int16_t *refcounts; /* One reference counter per block */
    uint32_t *bitmap;   /* One bit per block, 32 blocks per dword */
    size_t block_count; /* Number of blocks in this region */
    rt_mutex *guard;    /* Mutex taken around bitmap/refcount updates */
} rt_buffer_region;
#endif
|
|
|
|
/* Bit-scan helpers used by the block allocator.
 *
 *   lzcnt32(x)  - number of leading (most significant) zero bits of x
 *   tzcnt32(x)  - number of trailing (least significant) zero bits of x
 *   popcnt32(x) - number of set bits of x
 *
 * The raw GCC builtins __builtin_clz(0) / __builtin_ctz(0) are undefined and
 * _BitScanForward leaves its output untouched for a zero mask, so the
 * function wrappers below return 32 for x == 0 instead of an arbitrary
 * value. */
#ifdef _MSC_VER

#include <intrin.h>

/* __lzcnt only yields a leading-zero count on CPUs that implement LZCNT;
 * callers must check IsLZCNTSupported() before relying on it. */
#define lzcnt32(x) __lzcnt((x))
#define popcnt32(x) __popcnt((x))

static __forceinline uint32_t tzcnt32(uint32_t x) {
    unsigned long index;
    /* _BitScanForward returns 0 and does not write `index` when x == 0. */
    if (!_BitScanForward(&index, x))
        return 32u;
    return (uint32_t)index;
}

static __forceinline bool IsLZCNTSupported(void) {
    int info[4];
    /* Leaf 0x80000000 reports the highest supported extended leaf in EAX. */
    __cpuid(info, (int)0x80000000u);
    if ((unsigned int)info[0] < 0x80000001u)
        return false;
    /* Leaf 0x80000001, ECX bit 5 is the flag the code tests for LZCNT. */
    __cpuid(info, (int)0x80000001u);
    return (info[2] & (1 << 5)) != 0;
}

#elif defined(__GNUC__)

static inline uint32_t lzcnt32(uint32_t x) {
    return (x != 0) ? (uint32_t)__builtin_clz(x) : 32u;
}

static inline uint32_t tzcnt32(uint32_t x) {
    return (x != 0) ? (uint32_t)__builtin_ctz(x) : 32u;
}

#define popcnt32(x) __builtin_popcount((x))

#define IsLZCNTSupported() true

#else
#error "No lzcnt32/tzcnt32/popcnt32 implementation for this compiler."
#endif
|
|
|
|
#if 0
|
|
/* NOTE(Kevin): Keep these sorted! */
|
|
static size_t _block_sizes[] = {RT_KB(512), RT_MB(1), RT_MB(4), RT_MB(8)};
|
|
#define NUM_BLOCK_SIZES (sizeof(_block_sizes) / sizeof(_block_sizes[0]))
|
|
static rt_buffer_region _regions[NUM_BLOCK_SIZES];
|
|
|
|
RT_CVAR_SZ(rt_BufferManagerMemory,
|
|
"Total number of bytes allocated for the buffer manager. Default: 1GB",
|
|
RT_GB(1));
|
|
|
|
rt_result InitBufferManager(void) {
|
|
if ((rt_BufferManagerMemory.sz % NUM_BLOCK_SIZES) != 0)
|
|
rtLog("BUFFERMGR",
|
|
"Configured memory amount is not dividable by number of block "
|
|
"sizes: %u MB/%u",
|
|
rt_BufferManagerMemory.sz / (1024 * 1024),
|
|
NUM_BLOCK_SIZES);
|
|
|
|
size_t mem_per_size = rt_BufferManagerMemory.sz / NUM_BLOCK_SIZES;
|
|
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
|
|
if ((mem_per_size % _block_sizes[i]) != 0)
|
|
rtLog("BUFFERMGR",
|
|
"Memory per block size is not dividable by block size: %u "
|
|
"MB/%u KB",
|
|
mem_per_size / (1024 * 1024),
|
|
_block_sizes[i] / 1024);
|
|
|
|
size_t block_count = mem_per_size / _block_sizes[i];
|
|
_regions[i].block_count = block_count;
|
|
_regions[i].guard = rtCreateMutex();
|
|
if (!_regions[i].guard) {
|
|
rtReportError("BUFFERMGR", "Failed to create guard mutex %u", i);
|
|
return RT_BUFFER_MGR_MUTEX_CREATION_FAILED;
|
|
}
|
|
_regions[i].memory = malloc(mem_per_size);
|
|
if (!_regions[i].memory) {
|
|
rtDestroyMutex(_regions[i].guard);
|
|
rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
|
|
return RT_BUFFER_MGR_OUT_OF_MEMORY;
|
|
}
|
|
_regions[i].bitmap = calloc((block_count + 31) / 32, sizeof(uint32_t));
|
|
if (!_regions[i].bitmap) {
|
|
rtDestroyMutex(_regions[i].guard);
|
|
free(_regions[i].memory);
|
|
rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
|
|
return RT_BUFFER_MGR_OUT_OF_MEMORY;
|
|
}
|
|
_regions[i].refcounts = calloc(block_count, sizeof(uint16_t));
|
|
if (!_regions[i].refcounts) {
|
|
rtDestroyMutex(_regions[i].guard);
|
|
free(_regions[i].memory);
|
|
free(_regions[i].bitmap);
|
|
rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
|
|
return RT_BUFFER_MGR_OUT_OF_MEMORY;
|
|
}
|
|
}
|
|
return RT_SUCCESS;
|
|
}
|
|
|
|
void ShutdownBufferManager(void) {
|
|
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
|
|
rtDestroyMutex(_regions[i].guard);
|
|
free(_regions[i].memory);
|
|
free(_regions[i].bitmap);
|
|
free(_regions[i].refcounts);
|
|
}
|
|
}
|
|
|
|
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
|
|
assert(IsLZCNTSupported());
|
|
|
|
// Determine the best block size to use
|
|
size_t required_blocks = (size + _block_sizes[0] - 1) / _block_sizes[0];
|
|
size_t best_fit = 0;
|
|
for (size_t i = 1; i < NUM_BLOCK_SIZES; ++i) {
|
|
size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
|
|
if (block_count < required_blocks && size >= _block_sizes[i]) {
|
|
required_blocks = block_count;
|
|
best_fit = i;
|
|
}
|
|
}
|
|
|
|
void *result = NULL;
|
|
|
|
rt_buffer_region *region = &_regions[best_fit];
|
|
rtLockMutex(region->guard);
|
|
size_t dword_count = (region->block_count + 31) / 32;
|
|
|
|
if (required_blocks < 32) {
|
|
/* Fast path for allocations that potentially fit into one dword */
|
|
uint32_t in_use_mask = (1ull << required_blocks) - 1;
|
|
size_t max_occupancy = 32 - required_blocks;
|
|
for (size_t i = 0; i < dword_count; ++i) {
|
|
size_t block_index = 0;
|
|
if (region->bitmap[i] != 0 && popcnt32(region->bitmap[i]) < max_occupancy) {
|
|
size_t free_high_blocks = lzcnt32(region->bitmap[i]);
|
|
if (free_high_blocks >= required_blocks) {
|
|
/* High blocks are free */
|
|
size_t first_free = 32 - free_high_blocks;
|
|
region->bitmap[i] |= (in_use_mask << first_free);
|
|
block_index = i * 32 + first_free;
|
|
result = (char *)region->memory + block_index * _block_sizes[best_fit];
|
|
} else if (tzcnt32(region->bitmap[i]) >= required_blocks) {
|
|
/* Low blocks are free */
|
|
region->bitmap[i] |= in_use_mask;
|
|
block_index = i * 32;
|
|
result = (char *)region->memory + block_index * _block_sizes[best_fit];
|
|
} else {
|
|
/* Check if we can find a large enough range of free blocks.
|
|
* Start after the first set bit.
|
|
*/
|
|
for (uint32_t j = tzcnt32(region->bitmap[i]) + 1; j < 32 - required_blocks;
|
|
++j) {
|
|
if ((region->bitmap[i] & in_use_mask << j) == 0) {
|
|
region->bitmap[i] |= (in_use_mask << j);
|
|
block_index = i * 32 + j;
|
|
result = (char *)region->memory + block_index * _block_sizes[best_fit];
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else if (region->bitmap[i] == 0) {
|
|
/* All free */
|
|
region->bitmap[i] = in_use_mask;
|
|
block_index = i * 32;
|
|
result = (char *)region->memory + block_index * _block_sizes[best_fit];
|
|
} else if (i < dword_count - 1) {
|
|
/* Check if we can use high blocks from this dword and low blocks from the next one
|
|
*/
|
|
size_t high_blocks = lzcnt32(region->bitmap[i]);
|
|
size_t low_blocks =
|
|
(region->bitmap[i + 1] != 0) ? tzcnt32(region->bitmap[i + 1]) : 32;
|
|
|
|
if (high_blocks + low_blocks >= required_blocks) {
|
|
size_t high_mask = (1u << high_blocks) - 1;
|
|
size_t first_free = 32 - high_blocks;
|
|
size_t low_mask = (1u << (required_blocks - high_blocks)) - 1;
|
|
|
|
region->bitmap[i] |= (high_mask << first_free);
|
|
region->bitmap[i + 1] |= low_mask;
|
|
block_index = i * 32 + first_free;
|
|
result = (char *)region->memory + block_index * _block_sizes[best_fit];
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
for (size_t j = 0; j < required_blocks; ++j)
|
|
region->refcounts[block_index + j] = 1;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
for (size_t i = 0; i < dword_count; ++i) {
|
|
if (region->bitmap[i] == UINT32_MAX) {
|
|
continue;
|
|
}
|
|
/* Check if we can start the allocation here */
|
|
|
|
}
|
|
}
|
|
rtUnlockMutex(region->guard);
|
|
return result;
|
|
}
|
|
|
|
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
|
|
if (!begin)
|
|
return;
|
|
uintptr_t begin_addr = (uintptr_t)begin;
|
|
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
|
|
uintptr_t region_addr = (uintptr_t)_regions[i].memory;
|
|
size_t region_size = _block_sizes[i] * _regions[i].block_count;
|
|
if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {
|
|
|
|
size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
|
|
size_t first_block = (begin_addr - region_addr) / _block_sizes[i];
|
|
|
|
rtLockMutex(_regions[i].guard);
|
|
for (size_t j = 0; j < block_count; ++j) {
|
|
size_t dword = (first_block + j) / 32;
|
|
size_t bit = (first_block + j) % 32;
|
|
|
|
if (--_regions[i].refcounts[first_block + j] == 0)
|
|
_regions[i].bitmap[dword] &= ~(1u << bit);
|
|
}
|
|
rtUnlockMutex(_regions[i].guard);
|
|
return;
|
|
}
|
|
}
|
|
rtLog("BUFFERMGR", "Tried to release an invalid buffer");
|
|
}
|
|
|
|
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
|
|
uintptr_t begin_addr = (uintptr_t)begin;
|
|
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
|
|
uintptr_t region_addr = (uintptr_t)_regions[i].memory;
|
|
size_t region_size = _block_sizes[i] * _regions[i].block_count;
|
|
if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {
|
|
|
|
size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
|
|
size_t first_block = (begin_addr - region_addr) / _block_sizes[i];
|
|
|
|
rtLockMutex(_regions[i].guard);
|
|
for (size_t j = 0; j < block_count; ++j) {
|
|
++_regions[i].refcounts[first_block + j];
|
|
}
|
|
rtUnlockMutex(_regions[i].guard);
|
|
return;
|
|
}
|
|
}
|
|
rtLog("BUFFERMGR", "Tried to increase the refcount of an invalid buffer");
|
|
}
|
|
#endif
|
|
|
|
#define BLOCK_SIZE 4096u
|
|
|
|
static uint32_t *_refcounts;
|
|
static uint32_t *_bitmap;
|
|
static char *_memory;
|
|
static rt_mutex *_guard;
|
|
|
|
static size_t _block_count;
|
|
|
|
RT_CVAR_I(rt_BufferMemoryBudget,
|
|
"The amount of memory to allocate for the buffer manager. Default: 512MB",
|
|
RT_MB(512));
|
|
|
|
extern rt_result InitBufferManager(void) {
|
|
_guard = rtCreateMutex();
|
|
if (!_guard) {
|
|
rtReportError("BUFFERMGR", "Failed to create the buffer manager mutex.");
|
|
return RT_UNKNOWN_ERROR;
|
|
}
|
|
if (!IsLZCNTSupported()) {
|
|
rtReportError("BUFFERMGR", "The required lzcnt intrinisc is not supported.");
|
|
return RT_UNKNOWN_ERROR;
|
|
}
|
|
|
|
size_t budget = (size_t)rt_BufferMemoryBudget.i;
|
|
size_t block_count = budget / BLOCK_SIZE;
|
|
if ((budget % block_count) != 0) {
|
|
rtLog("BUFFERMGR",
|
|
"The configured buffer memory budget %zu is not dividable by the block size (4KB).",
|
|
budget);
|
|
}
|
|
size_t dword_count = (block_count + 31) / 32;
|
|
_block_count = block_count;
|
|
|
|
_memory = malloc(budget + dword_count * sizeof(uint32_t) + block_count * sizeof(uint32_t));
|
|
if (!_memory) {
|
|
return RT_OUT_OF_MEMORY;
|
|
}
|
|
_bitmap = (uint32_t*)(_memory + budget);
|
|
memset(_bitmap, 0, sizeof(uint32_t) * dword_count);
|
|
_refcounts = _bitmap + dword_count;
|
|
memset(_refcounts, 0, sizeof(uint32_t) * block_count);
|
|
|
|
return RT_SUCCESS;
|
|
}
|
|
|
|
extern void ShutdownBufferManager(void) {
|
|
rtDestroyMutex(_guard);
|
|
}
|
|
|
|
/* Public API */
|
|
|
|
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
|
|
size_t alloc_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
|
size_t dword_count = (_block_count + 31) / 32;
|
|
void *result = NULL;
|
|
size_t first_block = 0;
|
|
rtLockMutex(_guard);
|
|
for (size_t i = 0; i < _block_count; ++i) {
|
|
size_t dword = i / 32;
|
|
if (_bitmap[dword] == 0 || tzcnt32(_bitmap[dword]) >= alloc_blocks) {
|
|
size_t mask = (1ull << alloc_blocks) - 1;
|
|
_bitmap[dword] |= mask;
|
|
result = _memory + i * BLOCK_SIZE;
|
|
first_block = i;
|
|
}
|
|
else if (lzcnt32(_bitmap[dword]) >= alloc_blocks) {
|
|
size_t first = (_bitmap[dword] != 0) ? 32 - lzcnt32(_bitmap[dword]) : 0;
|
|
size_t mask = ((1ull << alloc_blocks) - 1) << first;
|
|
_bitmap[dword] |= mask;
|
|
result = _memory + (i + first) * BLOCK_SIZE;
|
|
first_block = i + first;
|
|
break;
|
|
} else if (_bitmap[dword] != UINT32_MAX) {
|
|
size_t first = 32 - lzcnt32(_bitmap[dword]);
|
|
size_t leftover = alloc_blocks - lzcnt32(_bitmap[dword]);
|
|
if (dword == dword_count - 1) {
|
|
break; // Reached the end
|
|
}
|
|
if (leftover < 32) {
|
|
size_t next_dword_free = _bitmap[dword + 1] != 0 ? tzcnt32(_bitmap[dword + 1]) : 32;
|
|
if (next_dword_free < leftover)
|
|
continue;
|
|
_bitmap[dword] = UINT32_MAX;
|
|
size_t mask = (1ull << leftover) - 1;
|
|
_bitmap[dword + 1] |= mask;
|
|
result = _memory + (i + first) * BLOCK_SIZE;
|
|
first_block = i + first;
|
|
break;
|
|
} else {
|
|
// Check each bit separately
|
|
bool free = true;
|
|
for (size_t j = i + first; j < i + first + alloc_blocks; ++j) {
|
|
size_t dwordj = j / 32;
|
|
size_t bitj = j % 32;
|
|
if ((_bitmap[dwordj] & (1u << bitj)) != 0) {
|
|
free = false;
|
|
break;
|
|
}
|
|
}
|
|
if (free) {
|
|
for (size_t j = i + first; j < i + first + alloc_blocks; ++j) {
|
|
size_t dwordj = j / 32;
|
|
size_t bitj = j % 32;
|
|
_bitmap[dwordj] |= (1u << bitj);
|
|
}
|
|
result = _memory + (i + first) * BLOCK_SIZE;
|
|
first_block = i + first;
|
|
}
|
|
}
|
|
} else {
|
|
/* These 32 blocks are all allocated. Go to the next dword */
|
|
assert((i % 32) == 0);
|
|
i += 31;
|
|
}
|
|
}
|
|
|
|
for (size_t i = first_block; i < first_block + alloc_blocks; ++i)
|
|
_refcounts[i] = 1;
|
|
|
|
rtUnlockMutex(_guard);
|
|
rtLog("BUFFERMGR", "Result ptr %llx", (uintptr_t)result);
|
|
return result;
|
|
}
|
|
|
|
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
|
|
size_t alloc_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
|
uintptr_t off = (uintptr_t)begin - (uintptr_t)_memory;
|
|
uintptr_t first_block = off / BLOCK_SIZE;
|
|
rtLockMutex(_guard);
|
|
for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
|
|
if (--_refcounts[i] == 0) {
|
|
size_t dword = i / 32;
|
|
size_t bit = i % 32;
|
|
_bitmap[dword] &= ~(1u << bit);
|
|
}
|
|
}
|
|
rtUnlockMutex(_guard);
|
|
}
|
|
|
|
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
|
|
size_t alloc_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
|
uintptr_t off = (uintptr_t)begin - (uintptr_t)_memory;
|
|
uintptr_t first_block = off / BLOCK_SIZE;
|
|
rtLockMutex(_guard);
|
|
for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
|
|
++_refcounts[i];
|
|
}
|
|
rtUnlockMutex(_guard);
|
|
} |