rtengine/src/runtime/buffer_manager.c
Kevin Trogant 3254af3786 Make progress towards the new builtin asset compiler
Attempts to compile HLSL shaders (with includes)
2024-01-25 09:45:23 +01:00

416 lines
15 KiB
C

#include "buffer_manager.h"
#include "config.h"
#include "runtime.h"
#include "threading.h"
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#if 0
/* Disabled (compiled out): per-region bookkeeping used by the earlier
 * multi-block-size allocator that is also disabled further down in this file. */
typedef struct rt_buffer_region_s {
/* Backing storage of the region (obtained with malloc by the disabled init code). */
void *memory;
int16_t *refcounts; // One per block
/* One bit per block; a set bit marks the block as in use. */
uint32_t *bitmap;
/* Number of blocks in this region. */
size_t block_count;
/* Locked around every bitmap/refcount access of this region. */
rt_mutex *guard;
} rt_buffer_region;
#endif
/* Bit-scan helpers, mapped onto compiler intrinsics.
 *
 *   lzcnt32(x)  - number of leading (most significant) zero bits of a 32-bit value
 *   tzcnt32(x)  - number of trailing (least significant) zero bits
 *   popcnt32(x) - number of set bits
 *
 * Do not pass 0 to lzcnt32()/tzcnt32(): __builtin_clz(0) and __builtin_ctz(0)
 * are undefined on GCC/Clang.
 */
#ifdef _MSC_VER
#include <intrin.h>
#define lzcnt32(x) __lzcnt((x))
#define popcnt32(x) __popcnt((x))
static __forceinline uint32_t tzcnt32(uint32_t x) {
    unsigned long index = 0;
    /* _BitScanForward returns 0 when no bit is set and does not produce a usable
     * index in that case. Report 32 instead of an uninitialized value. */
    if (!_BitScanForward(&index, x))
        return 32u;
    return (uint32_t)index;
}
/* Returns true if the CPU reports support for the LZCNT instruction. */
static __forceinline bool IsLZCNTSupported(void) {
    /* CPUID leaf 0x80000001: bit 5 of ECX (info[2]) is the LZCNT/ABM feature flag. */
    int info[4];
    __cpuid(info, 0x80000001);
    return (info[2] & (1 << 5)) != 0;
}
#elif defined(__GNUC__)
#define lzcnt32(x) __builtin_clz((x))
#define tzcnt32(x) __builtin_ctz((x))
#define popcnt32(x) __builtin_popcount((x))
/* The builtins do not depend on a specific instruction, so no runtime check is needed. */
#define IsLZCNTSupported() true
#else
#error "buffer_manager.c: no lzcnt32/tzcnt32/popcnt32 implementation for this compiler"
#endif
#if 0
/* Everything from here to the matching #endif is disabled (compiled out): an
 * earlier allocator that keeps one region per supported block size. */
/* NOTE(Kevin): Keep these sorted! */
static size_t _block_sizes[] = {RT_KB(512), RT_MB(1), RT_MB(4), RT_MB(8)};
/* Number of entries in _block_sizes, i.e. the number of regions. */
#define NUM_BLOCK_SIZES (sizeof(_block_sizes) / sizeof(_block_sizes[0]))
/* One region per block size; _regions[i] manages blocks of _block_sizes[i] bytes. */
static rt_buffer_region _regions[NUM_BLOCK_SIZES];
/* Total memory budget; the init code splits it evenly between the regions. */
RT_CVAR_SZ(rt_BufferManagerMemory,
"Total number of bytes allocated for the buffer manager. Default: 1GB",
RT_GB(1));
/* Disabled earlier implementation.
 *
 * Splits rt_BufferManagerMemory.sz evenly between the regions and, for each
 * region, creates the guard mutex and allocates the block memory, the block
 * bitmap and the per-block refcounts.
 *
 * Returns RT_SUCCESS, RT_BUFFER_MGR_MUTEX_CREATION_FAILED or
 * RT_BUFFER_MGR_OUT_OF_MEMORY.
 *
 * NOTE(review): when region i fails, only region i is cleaned up; the mutexes
 * and allocations of regions 0..i-1 are not released.
 * NOTE(review): the "Failed to allocate memory." reports pass `i` although the
 * message has no format specifier, and the %u specifiers below are used with
 * size_t-derived values (assuming printf-style formatting - confirm).
 */
rt_result InitBufferManager(void) {
/* Only warns; the remainder of the division is simply left unused. */
if ((rt_BufferManagerMemory.sz % NUM_BLOCK_SIZES) != 0)
rtLog("BUFFERMGR",
"Configured memory amount is not dividable by number of block "
"sizes: %u MB/%u",
rt_BufferManagerMemory.sz / (1024 * 1024),
NUM_BLOCK_SIZES);
size_t mem_per_size = rt_BufferManagerMemory.sz / NUM_BLOCK_SIZES;
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
/* Only warns; block_count below is rounded down. */
if ((mem_per_size % _block_sizes[i]) != 0)
rtLog("BUFFERMGR",
"Memory per block size is not dividable by block size: %u "
"MB/%u KB",
mem_per_size / (1024 * 1024),
_block_sizes[i] / 1024);
size_t block_count = mem_per_size / _block_sizes[i];
_regions[i].block_count = block_count;
_regions[i].guard = rtCreateMutex();
if (!_regions[i].guard) {
rtReportError("BUFFERMGR", "Failed to create guard mutex %u", i);
return RT_BUFFER_MGR_MUTEX_CREATION_FAILED;
}
_regions[i].memory = malloc(mem_per_size);
if (!_regions[i].memory) {
rtDestroyMutex(_regions[i].guard);
rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
return RT_BUFFER_MGR_OUT_OF_MEMORY;
}
/* One bit per block, rounded up to whole dwords; calloc leaves all blocks free. */
_regions[i].bitmap = calloc((block_count + 31) / 32, sizeof(uint32_t));
if (!_regions[i].bitmap) {
rtDestroyMutex(_regions[i].guard);
free(_regions[i].memory);
rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
return RT_BUFFER_MGR_OUT_OF_MEMORY;
}
/* The refcounts member is declared int16_t; the element size used here is the same. */
_regions[i].refcounts = calloc(block_count, sizeof(uint16_t));
if (!_regions[i].refcounts) {
rtDestroyMutex(_regions[i].guard);
free(_regions[i].memory);
free(_regions[i].bitmap);
rtReportError("BUFFERMGR", "Failed to allocate memory.", i);
return RT_BUFFER_MGR_OUT_OF_MEMORY;
}
}
return RT_SUCCESS;
}
/* Disabled earlier implementation: destroys the guard mutex and frees the block
 * memory, bitmap and refcounts of every region. */
void ShutdownBufferManager(void) {
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
rtDestroyMutex(_regions[i].guard);
free(_regions[i].memory);
free(_regions[i].bitmap);
free(_regions[i].refcounts);
}
}
/* Disabled earlier implementation.
 *
 * Chooses a block size for the request, then searches the bitmap of the
 * matching region for enough consecutive free blocks. On success the blocks are
 * marked as used, their refcounts are set to 1 and a pointer to the first block
 * is returned; otherwise NULL is returned.
 *
 * NOTE(review): only requests that need fewer than 32 blocks are implemented.
 * The else branch for larger requests contains no allocation logic, so those
 * calls always return NULL.
 */
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
assert(IsLZCNTSupported());
// Determine the best block size to use
/* Start with the smallest block size and switch to a larger one whenever the
 * request is at least one such block and needs fewer blocks of that size. */
size_t required_blocks = (size + _block_sizes[0] - 1) / _block_sizes[0];
size_t best_fit = 0;
for (size_t i = 1; i < NUM_BLOCK_SIZES; ++i) {
size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
if (block_count < required_blocks && size >= _block_sizes[i]) {
required_blocks = block_count;
best_fit = i;
}
}
void *result = NULL;
rt_buffer_region *region = &_regions[best_fit];
rtLockMutex(region->guard);
size_t dword_count = (region->block_count + 31) / 32;
if (required_blocks < 32) {
/* Fast path for allocations that potentially fit into one dword */
/* in_use_mask has the low required_blocks bits set. */
uint32_t in_use_mask = (1ull << required_blocks) - 1;
/* A dword with at least this many used bits cannot hold the request on its own. */
size_t max_occupancy = 32 - required_blocks;
for (size_t i = 0; i < dword_count; ++i) {
size_t block_index = 0;
/* Partially used dword that still has enough free bits in total. */
if (region->bitmap[i] != 0 && popcnt32(region->bitmap[i]) < max_occupancy) {
size_t free_high_blocks = lzcnt32(region->bitmap[i]);
if (free_high_blocks >= required_blocks) {
/* High blocks are free */
size_t first_free = 32 - free_high_blocks;
region->bitmap[i] |= (in_use_mask << first_free);
block_index = i * 32 + first_free;
result = (char *)region->memory + block_index * _block_sizes[best_fit];
} else if (tzcnt32(region->bitmap[i]) >= required_blocks) {
/* Low blocks are free */
region->bitmap[i] |= in_use_mask;
block_index = i * 32;
result = (char *)region->memory + block_index * _block_sizes[best_fit];
} else {
/* Check if we can find a large enough range of free blocks.
* Start after the first set bit.
*/
for (uint32_t j = tzcnt32(region->bitmap[i]) + 1; j < 32 - required_blocks;
++j) {
if ((region->bitmap[i] & in_use_mask << j) == 0) {
region->bitmap[i] |= (in_use_mask << j);
block_index = i * 32 + j;
result = (char *)region->memory + block_index * _block_sizes[best_fit];
break;
}
}
}
} else if (region->bitmap[i] == 0) {
/* All free */
region->bitmap[i] = in_use_mask;
block_index = i * 32;
result = (char *)region->memory + block_index * _block_sizes[best_fit];
} else if (i < dword_count - 1) {
/* Check if we can use high blocks from this dword and low blocks from the next one
*/
size_t high_blocks = lzcnt32(region->bitmap[i]);
/* tzcnt32 is only evaluated for a non-zero dword; an empty dword counts as 32 free. */
size_t low_blocks =
(region->bitmap[i + 1] != 0) ? tzcnt32(region->bitmap[i + 1]) : 32;
if (high_blocks + low_blocks >= required_blocks) {
size_t high_mask = (1u << high_blocks) - 1;
size_t first_free = 32 - high_blocks;
size_t low_mask = (1u << (required_blocks - high_blocks)) - 1;
region->bitmap[i] |= (high_mask << first_free);
region->bitmap[i + 1] |= low_mask;
block_index = i * 32 + first_free;
result = (char *)region->memory + block_index * _block_sizes[best_fit];
}
}
/* Something was allocated in this dword: initialize the refcounts and stop. */
if (result) {
for (size_t j = 0; j < required_blocks; ++j)
region->refcounts[block_index + j] = 1;
break;
}
}
} else {
/* Unfinished: full dwords are skipped, but nothing is ever allocated here. */
for (size_t i = 0; i < dword_count; ++i) {
if (region->bitmap[i] == UINT32_MAX) {
continue;
}
/* Check if we can start the allocation here */
}
}
rtUnlockMutex(region->guard);
return result;
}
/* Disabled earlier implementation.
 *
 * Finds the region that fully contains [begin, begin + size) and decrements the
 * refcount of every block covered by that range; a block whose refcount reaches
 * zero is marked free in the bitmap. A NULL pointer is ignored, a range that
 * lies in no region is logged.
 */
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
if (!begin)
return;
uintptr_t begin_addr = (uintptr_t)begin;
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
uintptr_t region_addr = (uintptr_t)_regions[i].memory;
size_t region_size = _block_sizes[i] * _regions[i].block_count;
/* The whole range has to lie inside this region. */
if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {
size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
size_t first_block = (begin_addr - region_addr) / _block_sizes[i];
rtLockMutex(_regions[i].guard);
for (size_t j = 0; j < block_count; ++j) {
size_t dword = (first_block + j) / 32;
size_t bit = (first_block + j) % 32;
/* Last reference gone: clear the in-use bit of the block. */
if (--_regions[i].refcounts[first_block + j] == 0)
_regions[i].bitmap[dword] &= ~(1u << bit);
}
rtUnlockMutex(_regions[i].guard);
return;
}
}
rtLog("BUFFERMGR", "Tried to release an invalid buffer");
}
/* Disabled earlier implementation.
 *
 * Finds the region that fully contains [begin, begin + size) and increments the
 * refcount of every block covered by that range. A range that lies in no
 * region is logged.
 */
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
uintptr_t begin_addr = (uintptr_t)begin;
for (unsigned int i = 0; i < NUM_BLOCK_SIZES; ++i) {
uintptr_t region_addr = (uintptr_t)_regions[i].memory;
size_t region_size = _block_sizes[i] * _regions[i].block_count;
/* The whole range has to lie inside this region. */
if (begin_addr >= region_addr && begin_addr + size <= region_addr + region_size) {
size_t block_count = (size + _block_sizes[i] - 1) / _block_sizes[i];
size_t first_block = (begin_addr - region_addr) / _block_sizes[i];
rtLockMutex(_regions[i].guard);
for (size_t j = 0; j < block_count; ++j) {
++_regions[i].refcounts[first_block + j];
}
rtUnlockMutex(_regions[i].guard);
return;
}
}
rtLog("BUFFERMGR", "Tried to increase the refcount of an invalid buffer");
}
/* End of the disabled earlier implementation. */
#endif
/* Size in bytes of one block; buffers are handed out in whole blocks. */
#define BLOCK_SIZE 4096u
/* Reference count of each block (one uint32_t per block). */
static uint32_t *_refcounts;
/* Allocation bitmap: one bit per block, a set bit marks the block as in use. */
static uint32_t *_bitmap;
/* Start of the block pool. The bitmap and the refcounts are placed behind the
 * pool inside the same malloc allocation (see InitBufferManager). */
static char *_memory;
/* Locked around every access to _bitmap and _refcounts. */
static rt_mutex *_guard;
/* Number of blocks in the pool. */
static size_t _block_count;
/* Configuration variable holding the pool size in bytes; InitBufferManager
 * reads its integer value (.i). */
RT_CVAR_I(rt_BufferMemoryBudget,
"The amount of memory to allocate for the buffer manager. Default: 512MB",
RT_MB(512));
extern rt_result InitBufferManager(void) {
_guard = rtCreateMutex();
if (!_guard) {
rtReportError("BUFFERMGR", "Failed to create the buffer manager mutex.");
return RT_UNKNOWN_ERROR;
}
if (!IsLZCNTSupported()) {
rtReportError("BUFFERMGR", "The required lzcnt intrinisc is not supported.");
return RT_UNKNOWN_ERROR;
}
size_t budget = (size_t)rt_BufferMemoryBudget.i;
size_t block_count = budget / BLOCK_SIZE;
if ((budget % block_count) != 0) {
rtLog("BUFFERMGR",
"The configured buffer memory budget %zu is not dividable by the block size (4KB).",
budget);
}
size_t dword_count = (block_count + 31) / 32;
_block_count = block_count;
_memory = malloc(budget + dword_count * sizeof(uint32_t) + block_count * sizeof(uint32_t));
if (!_memory) {
return RT_OUT_OF_MEMORY;
}
_bitmap = (uint32_t*)(_memory + budget);
memset(_bitmap, 0, sizeof(uint32_t) * dword_count);
_refcounts = _bitmap + dword_count;
memset(_refcounts, 0, sizeof(uint32_t) * block_count);
return RT_SUCCESS;
}
extern void ShutdownBufferManager(void) {
rtDestroyMutex(_guard);
}
/* Searches the block bitmap for `count` consecutive free blocks (first fit).
 *
 * `count` must be at least 1 and _guard must be held by the caller.
 * Returns the index of the first block of the run, or _block_count if there is
 * no run of that length.
 */
static size_t FindFreeBlockRun(size_t count) {
    size_t run_start  = 0;
    size_t run_length = 0;
    size_t block      = 0;
    while (block < _block_count) {
        const uint32_t bits = _bitmap[block / 32];
        const size_t bit    = block % 32;
        if (bit == 0) {
            if (bits == UINT32_MAX) {
                /* All 32 blocks are taken: the current run ends, skip the dword. */
                run_length = 0;
                block += 32;
                continue;
            }
            if (bits == 0 && _block_count - block >= 32) {
                /* All 32 blocks exist and are free: extend the run by a whole dword. */
                if (run_length == 0)
                    run_start = block;
                run_length += 32;
                if (run_length >= count)
                    return run_start;
                block += 32;
                continue;
            }
        }
        /* Mixed dword (or the partial last one): look at the blocks one by one. */
        if ((bits >> bit) & 1u) {
            run_length = 0;
        } else {
            if (run_length == 0)
                run_start = block;
            if (++run_length >= count)
                return run_start;
        }
        ++block;
    }
    return _block_count;
}

/* Public API */

/* Allocates a buffer of at least `size` bytes from the block pool.
 *
 * The request is rounded up to whole blocks of BLOCK_SIZE bytes, which have to
 * be contiguous. Every block of the new buffer starts with a reference count
 * of 1.
 *
 * Returns a pointer to the first block, or NULL if `size` is 0, exceeds the pool
 * or no sufficiently long run of free blocks exists.
 */
RT_DLLEXPORT void *rtAllocBuffer(size_t size) {
    void *result = NULL;
    /* Written without "size + BLOCK_SIZE - 1" so a huge size cannot wrap around. */
    const size_t alloc_blocks = size / BLOCK_SIZE + ((size % BLOCK_SIZE) != 0 ? 1 : 0);

    if (alloc_blocks > 0 && alloc_blocks <= _block_count) {
        rtLockMutex(_guard);
        const size_t first_block = FindFreeBlockRun(alloc_blocks);
        if (first_block < _block_count) {
            /* Mark the blocks and set the refcounts only when a run was found. */
            for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
                _bitmap[i / 32] |= 1u << (i % 32);
                _refcounts[i] = 1;
            }
            result = _memory + first_block * BLOCK_SIZE;
        }
        rtUnlockMutex(_guard);
    }

    /* %llx expects unsigned long long; uintptr_t may be a different type. */
    rtLog("BUFFERMGR", "Result ptr %llx", (unsigned long long)(uintptr_t)result);
    return result;
}
RT_DLLEXPORT void rtReleaseBuffer(const void *begin, size_t size) {
size_t alloc_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
uintptr_t off = (uintptr_t)begin - (uintptr_t)_memory;
uintptr_t first_block = off / BLOCK_SIZE;
rtLockMutex(_guard);
for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
if (--_refcounts[i] == 0) {
size_t dword = i / 32;
size_t bit = i % 32;
_bitmap[dword] &= ~(1u << bit);
}
}
rtUnlockMutex(_guard);
}
RT_DLLEXPORT void rtIncreaseBufferRefCount(const void *begin, size_t size) {
size_t alloc_blocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
uintptr_t off = (uintptr_t)begin - (uintptr_t)_memory;
uintptr_t first_block = off / BLOCK_SIZE;
rtLockMutex(_guard);
for (size_t i = first_block; i < first_block + alloc_blocks; ++i) {
++_refcounts[i];
}
rtUnlockMutex(_guard);
}