From 887fa63c430a10c788b46ea664d2dad0dd9ea77b Mon Sep 17 00:00:00 2001
From: Kevin Trogant
Date: Tue, 13 Feb 2024 08:35:19 +0100
Subject: [PATCH] Manage command buffers

I decided to make queues explicit to simplify handling queue ownership
transfers in the renderer code. The framegraph + pass code has explicit
knowledge about resource ownership, so it makes sense to handle the
transfers there.

- Manage pools
- Allocate command buffers
- Submit command buffers
---
 src/renderer/vk/command_buffers.c | 301 +++++++++++++++++++++++++++++-
 src/renderer/vk/command_buffers.h |   2 +
 src/renderer/vk/frame.c           |   9 +
 src/renderer/vk/gpu.h             |  15 ++
 src/renderer/vk/helper.c          |  13 ++
 src/renderer/vk/init.c            |  42 ++++-
 src/renderer/vk/meson.build       |   1 +
 src/renderer/vk/swapchain.c       |   7 +-
 src/runtime/atomics.h             |  36 +++
 src/runtime/gfx.h                 |   2 +
 src/runtime/gfx_main.c            |  18 +-
 src/runtime/main_loop.c           |  10 +-
 src/runtime/main_loop.h           |   4 +-
 src/runtime/meson.build           |   1 +
 src/runtime/renderer_api.h        |  63 +++++--
 src/runtime/runtime.h             |   2 +
 16 files changed, 479 insertions(+), 47 deletions(-)
 create mode 100644 src/renderer/vk/frame.c
 create mode 100644 src/runtime/atomics.h

diff --git a/src/renderer/vk/command_buffers.c b/src/renderer/vk/command_buffers.c
index 4f546b1..fb77acb 100644
--- a/src/renderer/vk/command_buffers.c
+++ b/src/renderer/vk/command_buffers.c
@@ -1,14 +1,299 @@
-#include "runtime/renderer_api.h"
+#include "gpu.h"
+#include "swapchain.h"
 
-rt_result
-RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
-                                        rt_render_command_buffer_handle *p_command_buffers,
-                                        rt_gpu_semaphore_handle *p_semaphores) {
+#include "runtime/atomics.h"
+#include "runtime/config.h"
+#include "runtime/mem_arena.h"
+#include "runtime/renderer_api.h"
+#include "runtime/runtime.h"
+
+#include <stdlib.h>
+
+RT_CVAR_I(rt_VkMaxCommandPools,
+          "Maximum number of command pools that can be created. Default: 32",
+          32);
+RT_CVAR_I(
+    rt_VkCommandBufferRingBufferSize,
+    "Size of the ring buffer used to store command buffers. Must be a power of two! Default: 512",
+    512);
+
+typedef struct {
+    VkCommandPool pools[RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT * 3];
+    uint32_t distinct_pool_count;
+
+    VkCommandPool *compute_pools;
+    VkCommandPool *graphics_pools;
+    VkCommandPool *transfer_pools;
+} rt_thread_pools;
+
+typedef struct {
+    VkCommandBuffer command_buffer;
+    uint32_t version;
+    rt_gpu_queue target_queue;
+} rt_command_buffer;
+
+static rt_thread_pools *_pools;
+static uint32_t _next_pools;
+static RT_THREAD_LOCAL unsigned int t_first_pool;
+
+static rt_command_buffer *_command_buffers;
+/* We let this overflow on its own. Take it modulo rt_VkCommandBufferRingBufferSize to get the actual index.
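+ * Because the ring size must be a power of two, it divides 2^32 evenly, so the
+ * slot computed from the wrapped counter stays consistent across the overflow.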
+ */
+static uint32_t _next_command_buffer;
+
+rt_result InitCommandBufferManagement(void) {
+    _pools = calloc((size_t)rt_VkMaxCommandPools.i, sizeof(rt_thread_pools));
+    if (!_pools)
+        return RT_OUT_OF_MEMORY;
+
+    _command_buffers =
+        calloc((size_t)rt_VkCommandBufferRingBufferSize.i, sizeof(rt_command_buffer));
+    if (!_command_buffers) {
+        free(_pools);
+        return RT_OUT_OF_MEMORY;
+    }
+
+    /* Keep 0 free as a "Not initialized" value for t_first_pool */
+    _next_pools = 1;
     return RT_SUCCESS;
 }
 
-rt_result
-RT_RENDERER_API_FN(SubmitCommandBuffers)(uint32_t count,
-                                         const rt_render_command_buffer_handle *command_buffers) {
+static void DestroyPools(rt_thread_pools *pools) {
+    for (uint32_t j = 0; j < pools->distinct_pool_count; ++j)
+        vkDestroyCommandPool(g_gpu.device, pools->pools[j], g_gpu.alloc_cb);
+    /* Clear the slot, so that a partially created set is never reused. */
+    pools->distinct_pool_count = 0;
+    pools->graphics_pools = NULL;
+    pools->compute_pools = NULL;
+    pools->transfer_pools = NULL;
+}
+
+void ShutdownCommandBufferManagement(void) {
+    /* Slots [1, _next_pools) are in use; slot 0 is the "not initialized" marker. */
+    for (uint32_t i = 1; i < _next_pools; ++i) {
+        DestroyPools(&_pools[i]);
+    }
+    free(_pools);
+    free(_command_buffers);
+}
+
+void rtResetCommandPools(unsigned int frame_id) {
+    unsigned int pool_idx = frame_id % g_gpu.max_frames_in_flight;
+    for (uint32_t i = 1; i < _next_pools; ++i) {
+        if (!_pools[i].graphics_pools)
+            continue; /* Slot acquired, but pool creation failed. */
+        if (vkResetCommandPool(g_gpu.device,
+                               _pools[i].graphics_pools[pool_idx],
+                               VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) {
+            rtLog("vk", "Failed to reset graphics pool slot %u index %u", i, pool_idx);
+        }
+        if (_pools[i].compute_pools != _pools[i].graphics_pools) {
+            if (vkResetCommandPool(g_gpu.device,
+                                   _pools[i].compute_pools[pool_idx],
+                                   VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) {
+                rtLog("vk", "Failed to reset compute pool slot %u index %u", i, pool_idx);
+            }
+        }
+        if (_pools[i].transfer_pools != _pools[i].graphics_pools &&
+            _pools[i].transfer_pools != _pools[i].compute_pools) {
+            if (vkResetCommandPool(g_gpu.device,
+                                   _pools[i].transfer_pools[pool_idx],
+                                   VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT) != VK_SUCCESS) {
+                rtLog("vk", "Failed to reset transfer pool slot %u index %u", i, pool_idx);
+            }
+        }
+    }
+}
+
+static rt_result CreatePools(rt_thread_pools *pools) {
+    /* Graphics pools */
+    pools->graphics_pools = pools->pools;
+    pools->distinct_pool_count = 0;
+    VkCommandPoolCreateInfo graphics_info = {.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+                                             .queueFamilyIndex = g_gpu.graphics_family,
+                                             .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT};
+    for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) {
+        if (vkCreateCommandPool(g_gpu.device,
+                                &graphics_info,
+                                g_gpu.alloc_cb,
+                                &pools->graphics_pools[i]) != VK_SUCCESS) {
+            rtLog("vk", "Failed to create a graphics command pool.");
+            DestroyPools(pools);
+            return RT_UNKNOWN_ERROR;
+        }
+        ++pools->distinct_pool_count;
+    }
+
+    if (g_gpu.compute_family != g_gpu.graphics_family) {
+        VkCommandPoolCreateInfo compute_info = {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+            .queueFamilyIndex = g_gpu.compute_family,
+            .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT,
+        };
+        pools->compute_pools = &pools->pools[pools->distinct_pool_count];
+        for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) {
+            if (vkCreateCommandPool(g_gpu.device,
+                                    &compute_info,
+                                    g_gpu.alloc_cb,
+                                    &pools->compute_pools[i]) != VK_SUCCESS) {
+                rtLog("vk", "Failed to create a compute command pool.");
+                DestroyPools(pools);
+                return RT_UNKNOWN_ERROR;
+            }
+            ++pools->distinct_pool_count;
+        }
+    } else {
+        pools->compute_pools = pools->graphics_pools;
+    }
+
+    if (g_gpu.transfer_family != g_gpu.graphics_family &&
+        g_gpu.transfer_family != g_gpu.compute_family) {
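+        /* A dedicated transfer family gets its own set of per-frame pools. */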
+        VkCommandPoolCreateInfo transfer_info = {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+            .queueFamilyIndex = g_gpu.transfer_family,
+            .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT,
+        };
+        pools->transfer_pools = &pools->pools[pools->distinct_pool_count];
+        for (uint32_t i = 0; i < g_gpu.max_frames_in_flight; ++i) {
+            if (vkCreateCommandPool(g_gpu.device,
+                                    &transfer_info,
+                                    g_gpu.alloc_cb,
+                                    &pools->transfer_pools[i]) != VK_SUCCESS) {
+                rtLog("vk", "Failed to create a transfer command pool.");
+                DestroyPools(pools);
+                return RT_UNKNOWN_ERROR;
+            }
+            ++pools->distinct_pool_count;
+        }
+    } else if (g_gpu.transfer_family == g_gpu.graphics_family) {
+        pools->transfer_pools = pools->graphics_pools;
+    } else if (g_gpu.transfer_family == g_gpu.compute_family) {
+        pools->transfer_pools = pools->compute_pools;
+    }
     return RT_SUCCESS;
+}
+
+rt_result RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
+                                                  const rt_alloc_command_buffer_info *info,
+                                                  rt_command_buffer_handle *p_command_buffers) {
+    if (t_first_pool == 0) {
+        /* Acquire a pool slot for this thread. FetchAdd returns the previous
+         * value, so the first thread gets slot 1; 0 means "not initialized". */
+        t_first_pool = rtAtomic32FetchAdd(&_next_pools, 1);
+        RT_ASSERT((int)t_first_pool < rt_VkMaxCommandPools.i, "Too many command pools created.");
+    }
+    if ((int)t_first_pool >= rt_VkMaxCommandPools.i)
+        return RT_OUT_OF_MEMORY;
+
+    rt_thread_pools *pools = &_pools[t_first_pool];
+    if (pools->graphics_pools == NULL) {
+        /* First allocation on this thread, or an earlier attempt failed. */
+        rt_result create_res = CreatePools(pools);
+        if (create_res != RT_SUCCESS)
+            return create_res;
+    }
+
+    uint32_t frame_id = g_gpu.current_frame_id % g_gpu.max_frames_in_flight;
+    rt_result result = RT_SUCCESS;
+
+    /* TODO: We should probably batch allocations of the same type */
+    uint32_t mod = (uint32_t)rt_VkCommandBufferRingBufferSize.i;
+    uint32_t start = rtAtomic32FetchAdd(&_next_command_buffer, count);
+    for (uint32_t i = 0; i < count; ++i) {
+        uint32_t slot = (start + i) % mod;
+        _command_buffers[slot].version =
+            (_command_buffers[slot].version + 1) % RT_RENDER_BACKEND_HANDLE_MAX_VERSION;
+        if (_command_buffers[slot].version == 0)
+            _command_buffers[slot].version = 1;
+
+        VkCommandPool pool = pools->graphics_pools[frame_id];
+        if (info[i].target_queue == RT_COMPUTE_QUEUE)
+            pool = pools->compute_pools[frame_id];
+        else if (info[i].target_queue == RT_TRANSFER_QUEUE)
+            pool = pools->transfer_pools[frame_id];
+        _command_buffers[slot].target_queue = info[i].target_queue;
+
+        VkCommandBufferAllocateInfo alloc_info = {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+            .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+            .commandBufferCount = 1,
+            .commandPool = pool,
+        };
+        if (vkAllocateCommandBuffers(g_gpu.device,
+                                     &alloc_info,
+                                     &_command_buffers[slot].command_buffer) != VK_SUCCESS) {
+            result = RT_UNKNOWN_ERROR;
+            break;
+        }
+
+        p_command_buffers[i].index = slot;
+        p_command_buffers[i].version = _command_buffers[slot].version;
+    }
+
+    return result;
+}
+
+rt_result RT_RENDERER_API_FN(SubmitCommandBuffers)(rt_gpu_queue queue,
+                                                   const rt_submit_command_buffers_info *info) {
+    uint32_t count = info->command_buffer_count;
+    rt_temp_arena temp = rtGetTemporaryArena(NULL, 0);
+    if (!temp.arena)
+        return RT_OUT_OF_MEMORY;
+
+    rt_result result = RT_SUCCESS;
+    VkQueue target_queue = rtGetQueue(queue);
+
+    VkCommandBuffer *command_buffers = RT_ARENA_PUSH_ARRAY(temp.arena, VkCommandBuffer, count);
+    if (!command_buffers) {
+        result = RT_OUT_OF_MEMORY;
+        goto out;
+    }
+
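+    /* Validate every handle before touching the queue: a version mismatch
+     * means the ring buffer slot was reused since the handle was created. */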
+    for (uint32_t i = 0; i < count; ++i) {
+        uint32_t slot = info->command_buffers[i].index;
+        if (_command_buffers[slot].version != info->command_buffers[i].version) {
+            rtLog("vk",
+                  "Mismatch between handle version and stored version while submitting a command "
+                  "buffer");
+            result = RT_INVALID_VALUE;
+            goto out;
+        }
+        if (_command_buffers[slot].target_queue != queue) {
+            rtLog("vk", "Mismatch between command buffer target queue and submit target queue.");
+            result = RT_INVALID_VALUE;
+            goto out;
+        }
+        command_buffers[i] = _command_buffers[slot].command_buffer;
+    }
+
+    /* TODO(Kevin): Retrieve semaphores */
+    VkSemaphore *wait_semaphores = NULL;
+    VkSemaphore *signal_semaphores = NULL;
+    uint32_t wait_count = 0;
+    uint32_t signal_count = 0;
+
+    VkSubmitInfo submit_info = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .pCommandBuffers = command_buffers,
+        .commandBufferCount = count,
+        .pWaitSemaphores = wait_semaphores,
+        .pWaitDstStageMask = NULL,
+        .waitSemaphoreCount = wait_count,
+        .pSignalSemaphores = signal_semaphores,
+        .signalSemaphoreCount = signal_count,
+    };
+
+    if (vkQueueSubmit(target_queue, 1, &submit_info, VK_NULL_HANDLE) != VK_SUCCESS) {
+        rtLog("vk", "vkQueueSubmit failed.");
+        result = RT_UNKNOWN_ERROR;
+    }
+
+out:
+    rtReturnTemporaryArena(temp);
+    return result;
 }
\ No newline at end of file
diff --git a/src/renderer/vk/command_buffers.h b/src/renderer/vk/command_buffers.h
index 2a8ecf7..d8803ae 100644
--- a/src/renderer/vk/command_buffers.h
+++ b/src/renderer/vk/command_buffers.h
@@ -3,4 +3,6 @@
 
 #include "runtime/runtime.h"
 
+void rtResetCommandPools(unsigned int frame_id);
+
 #endif
diff --git a/src/renderer/vk/frame.c b/src/renderer/vk/frame.c
new file mode 100644
index 0000000..5a9af8d
--- /dev/null
+++ b/src/renderer/vk/frame.c
@@ -0,0 +1,9 @@
+#include "gpu.h"
+#include "command_buffers.h"
+
+#include "runtime/renderer_api.h"
+
+void RT_RENDERER_API_FN(BeginFrame)(unsigned int frame_id) {
+    g_gpu.current_frame_id = frame_id;
+    rtResetCommandPools(frame_id);
+}
\ No newline at end of file
diff --git a/src/renderer/vk/gpu.h b/src/renderer/vk/gpu.h
index 3bf5422..7b54351 100644
--- a/src/renderer/vk/gpu.h
+++ b/src/renderer/vk/gpu.h
@@ -9,6 +9,14 @@
 
 #include "runtime/renderer_api.h"
 
+/* Minimum supported value of g_gpu.max_frames_in_flight */
+#define RT_VK_MIN_SUPPORTED_FRAMES_IN_FLIGHT 2
+
+/* Maximum supported number of frames in flight.
+ * The configured value is stored in g_gpu.max_frames_in_flight.
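+ * Per-frame arrays in the backend are statically sized for this maximum.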
*/ +#define RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT 3 + #ifdef _WIN32 struct HINSTANCE__; struct HWND__; @@ -36,9 +43,11 @@ typedef struct { VkQueue graphics_queue; VkQueue compute_queue; VkQueue present_queue; + VkQueue transfer_queue; uint32_t graphics_family; uint32_t compute_family; uint32_t present_family; + uint32_t transfer_family; rt_native_window native_window; @@ -48,6 +57,9 @@ typedef struct { VkPhysicalDeviceFeatures phys_device_features; VmaAllocator allocator; + + unsigned int max_frames_in_flight; + unsigned int current_frame_id; } rt_vk_gpu; #ifndef RT_VK_DONT_DEFINE_GPU_GLOBAL @@ -60,4 +72,6 @@ VkFormat rtPixelFormatToVkFormat(rt_pixel_format format); VkSampleCountFlagBits rtSampleCountToFlags(unsigned int count); +VkQueue rtGetQueue(rt_gpu_queue queue); + #endif diff --git a/src/renderer/vk/helper.c b/src/renderer/vk/helper.c index 81fbf2d..2de9a4f 100644 --- a/src/renderer/vk/helper.c +++ b/src/renderer/vk/helper.c @@ -40,4 +40,17 @@ VkSampleCountFlagBits rtSampleCountToFlags(unsigned int count) { break; } return (VkSampleCountFlagBits)count; +} + +VkQueue rtGetQueue(rt_gpu_queue queue) { + switch (queue) { + case RT_GRAPHICS_QUEUE: + return g_gpu.graphics_queue; + case RT_COMPUTE_QUEUE: + return g_gpu.compute_queue; + case RT_TRANSFER_QUEUE: + return g_gpu.transfer_queue; + default: + return VK_NULL_HANDLE; + } } \ No newline at end of file diff --git a/src/renderer/vk/init.c b/src/renderer/vk/init.c index 254a40c..34d6d6c 100644 --- a/src/renderer/vk/init.c +++ b/src/renderer/vk/init.c @@ -19,6 +19,8 @@ RT_CVAR_I(r_VkEnableAPIAllocTracking, RT_CVAR_S(r_VkPhysDeviceName, "Name of the selected physical device. Default: \"\"", ""); +RT_CVAR_I(r_VkMaxFramesInFlight, "Maximum number of frames in flight. [2/3] Default: 2", 2); + rt_vk_gpu g_gpu; static VkAllocationCallbacks _tracking_alloc_cbs; @@ -82,12 +84,15 @@ DebugUtilsMessengerCb(VkDebugUtilsMessageSeverityFlagBitsEXT severity, extern rt_cvar r_VkPreferredSwapchainImages; extern rt_cvar r_VkPreferMailboxMode; +extern rt_cvar r_VkMaxPipelineCount; void RT_RENDERER_API_FN(RegisterCVars)(void) { rtRegisterCVAR(&r_VkEnableAPIAllocTracking); rtRegisterCVAR(&r_VkPhysDeviceName); rtRegisterCVAR(&r_VkPreferredSwapchainImages); rtRegisterCVAR(&r_VkPreferMailboxMode); + rtRegisterCVAR(&r_VkMaxFramesInFlight); + rtRegisterCVAR(&r_VkMaxPipelineCount); } static rt_result CreateInstance(void) { @@ -211,12 +216,14 @@ typedef struct { uint32_t graphics; uint32_t compute; uint32_t present; + uint32_t transfer; } rt_queue_indices; static rt_queue_indices RetrieveQueueIndices(VkPhysicalDevice phys_dev, VkSurfaceKHR surface) { rt_queue_indices indices = {.graphics = UINT32_MAX, .compute = UINT32_MAX, - .present = UINT32_MAX}; + .present = UINT32_MAX, + .transfer = UINT32_MAX}; uint32_t count = 0; vkGetPhysicalDeviceQueueFamilyProperties(phys_dev, &count, NULL); @@ -232,12 +239,20 @@ static rt_queue_indices RetrieveQueueIndices(VkPhysicalDevice phys_dev, VkSurfac indices.graphics = i; if ((props[i].queueFlags & VK_QUEUE_COMPUTE_BIT) != 0) indices.compute = i; + if ((props[i].queueFlags & VK_QUEUE_TRANSFER_BIT) != 0) + indices.transfer = i; VkBool32 present_supported = VK_FALSE; vkGetPhysicalDeviceSurfaceSupportKHR(phys_dev, i, surface, &present_supported); if (present_supported) indices.present = i; } + + if (indices.transfer == UINT32_MAX && indices.graphics != UINT32_MAX) + indices.transfer = indices.graphics; + else if (indices.transfer == UINT32_MAX && indices.compute != UINT32_MAX) + indices.transfer = indices.compute; + free(props); 
     return indices;
 }
 
@@ -405,11 +420,12 @@ static rt_result CreateDevice(void) {
     g_gpu.compute_family = queue_indices.compute;
     g_gpu.graphics_family = queue_indices.graphics;
     g_gpu.present_family = queue_indices.present;
+    g_gpu.transfer_family = queue_indices.transfer;
 
     float priority = 1.f;
     uint32_t distinct_queue_count = 1;
 
-    VkDeviceQueueCreateInfo queue_info[3];
+    VkDeviceQueueCreateInfo queue_info[4];
     queue_info[0].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
     queue_info[0].pNext = NULL;
     queue_info[0].flags = 0;
@@ -433,6 +449,18 @@
         queue_info[distinct_queue_count].queueCount = 1;
         queue_info[distinct_queue_count].queueFamilyIndex = queue_indices.present;
         queue_info[distinct_queue_count].pQueuePriorities = &priority;
+        ++distinct_queue_count;
+    }
+    if (queue_indices.transfer != queue_indices.graphics &&
+        queue_indices.transfer != queue_indices.compute &&
+        queue_indices.transfer != queue_indices.present) {
+        queue_info[distinct_queue_count].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+        queue_info[distinct_queue_count].pNext = NULL;
+        queue_info[distinct_queue_count].flags = 0;
+        queue_info[distinct_queue_count].queueCount = 1;
+        queue_info[distinct_queue_count].queueFamilyIndex = queue_indices.transfer;
+        queue_info[distinct_queue_count].pQueuePriorities = &priority;
+        ++distinct_queue_count;
     }
 
     VkPhysicalDeviceDescriptorIndexingFeatures indexing_features = {
@@ -463,6 +491,7 @@ static rt_result CreateDevice(void) {
     vkGetDeviceQueue(g_gpu.device, queue_indices.graphics, 0, &g_gpu.graphics_queue);
     vkGetDeviceQueue(g_gpu.device, queue_indices.compute, 0, &g_gpu.compute_queue);
     vkGetDeviceQueue(g_gpu.device, queue_indices.present, 0, &g_gpu.present_queue);
+    vkGetDeviceQueue(g_gpu.device, queue_indices.transfer, 0, &g_gpu.transfer_queue);
 
     return RT_SUCCESS;
 }
@@ -519,6 +548,8 @@ extern rt_result InitPipelineManagement(void);
 extern void ShutdownPipelineManagement(void);
 extern rt_result InitRenderTargetManagement(void);
 extern void ShutdownRenderTargetManagement(void);
+extern rt_result InitCommandBufferManagement(void);
+extern void ShutdownCommandBufferManagement(void);
 
 rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *info) {
     rtLog("vk", "Init");
@@ -533,6 +564,9 @@ rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *info) {
     } else {
         g_gpu.alloc_cb = NULL;
     }
+    g_gpu.max_frames_in_flight = RT_RESTRICT_VALUE_TO_BOUNDS(r_VkMaxFramesInFlight.i,
+                                                             RT_VK_MIN_SUPPORTED_FRAMES_IN_FLIGHT,
+                                                             RT_VK_MAX_SUPPORTED_FRAMES_IN_FLIGHT);
 
     int res = CreateInstance();
     if (res != RT_SUCCESS)
@@ -553,6 +587,9 @@ rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *info) {
     if (res != RT_SUCCESS)
         return res;
     res = InitRenderTargetManagement();
+    if (res != RT_SUCCESS)
+        return res;
+    res = InitCommandBufferManagement();
     if (res != RT_SUCCESS)
         return res;
     res = rtCreateSwapchain();
@@ -566,6 +603,7 @@ void RT_RENDERER_API_FN(Shutdown)(void) {
     rtLog("vk", "Shutdown");
     vkDeviceWaitIdle(g_gpu.device);
     rtDestroySwapchain();
+    ShutdownCommandBufferManagement();
     ShutdownRenderTargetManagement();
     ShutdownPipelineManagement();
     DestroyAllocator();
diff --git a/src/renderer/vk/meson.build b/src/renderer/vk/meson.build
index f3c002e..dd5191a 100644
--- a/src/renderer/vk/meson.build
+++ b/src/renderer/vk/meson.build
@@ -16,6 +16,7 @@ if vk_dep.found()
         'swapchain.h',
 
         'command_buffers.c',
+        'frame.c',
         'helper.c',
         'init.c',
         'pipelines.c',
diff --git a/src/renderer/vk/swapchain.c b/src/renderer/vk/swapchain.c
index aaee21a..509827d 100644
--- a/src/renderer/vk/swapchain.c
+++ b/src/renderer/vk/swapchain.c
@@ -121,12 +121,7 @@ rt_result rtCreateSwapchain(void) {
         return 50;
     }
     g_swapchain.format = device_params.surface_format.format;
-    g_swapchain.extent =
-
-
-
-
-        device_params.extent;
+    g_swapchain.extent = device_params.extent;
 
     /* Retrieve images */
     g_swapchain.image_count = 0;
diff --git a/src/runtime/atomics.h b/src/runtime/atomics.h
new file mode 100644
index 0000000..1d63d3e
--- /dev/null
+++ b/src/runtime/atomics.h
@@ -0,0 +1,36 @@
+#ifndef RT_ATOMICS_H
+#define RT_ATOMICS_H
+
+/* Macros & helpers for atomic instructions */
+
+#ifdef _MSC_VER
+
+#include <intrin.h>
+
+/* Increment and decrement return the new value */
+
+#define rtAtomic32Inc(pa) _InterlockedIncrement((volatile long *)(pa))
+#define rtAtomic64Inc(pa) _InterlockedIncrement64((volatile __int64 *)(pa))
+#define rtAtomic32Dec(pa) _InterlockedDecrement((volatile long *)(pa))
+#define rtAtomic64Dec(pa) _InterlockedDecrement64((volatile __int64 *)(pa))
+
+/* FetchAdd returns the previous value */
+#define rtAtomic32FetchAdd(pa, value) _InterlockedExchangeAdd((volatile long *)(pa), (long)(value))
+#define rtAtomic64FetchAdd(pa, value) _InterlockedExchangeAdd64((volatile __int64 *)(pa), (__int64)(value))
+
+#elif defined(__GNUC__) || defined(__clang__)
+
+#define rtAtomic32Inc(pa) __atomic_add_fetch((pa), 1, __ATOMIC_SEQ_CST)
+#define rtAtomic64Inc(pa) __atomic_add_fetch((pa), 1LL, __ATOMIC_SEQ_CST)
+#define rtAtomic32Dec(pa) __atomic_sub_fetch((pa), 1, __ATOMIC_SEQ_CST)
+#define rtAtomic64Dec(pa) __atomic_sub_fetch((pa), 1LL, __ATOMIC_SEQ_CST)
+
+/* FetchAdd returns the previous value */
+#define rtAtomic32FetchAdd(pa, value) __atomic_fetch_add((pa), (value), __ATOMIC_SEQ_CST)
+#define rtAtomic64FetchAdd(pa, value) __atomic_fetch_add((pa), (value), __ATOMIC_SEQ_CST)
+
+#else
+#error Unsupported compiler. No atomics available.
+#endif
+
+#endif
diff --git a/src/runtime/gfx.h b/src/runtime/gfx.h
index 4357de2..75ee5f6 100644
--- a/src/runtime/gfx.h
+++ b/src/runtime/gfx.h
@@ -61,6 +61,8 @@ RT_DLLEXPORT rt_result rtInitGFX(rt_renderer_init_info *renderer_info);
 
 RT_DLLEXPORT void rtShutdownGFX(void);
 
+RT_DLLEXPORT void rtBeginGFXFrame(unsigned int frame_id);
+
 /* *********************************************************************
  * Framegraph API
  *
diff --git a/src/runtime/gfx_main.c b/src/runtime/gfx_main.c
index 7f9f6d7..c288472 100644
--- a/src/runtime/gfx_main.c
+++ b/src/runtime/gfx_main.c
@@ -25,17 +25,17 @@ RT_CVAR_S(rt_Renderer, "Select the render backend. Available options: [vk], Defa
 extern void RT_RENDERER_API_FN(RegisterCVars)(void);
 extern rt_result RT_RENDERER_API_FN(Init)(const rt_renderer_init_info *);
 extern void RT_RENDERER_API_FN(Shutdown)(void);
+extern void RT_RENDERER_API_FN(BeginFrame)(unsigned int);
 extern rt_pipeline_handle RT_RENDERER_API_FN(CompilePipeline)(const rt_pipeline_info *);
 extern void RT_RENDERER_API_FN(DestroyPipeline)(rt_pipeline_handle);
 extern rt_render_target_handle
     RT_RENDERER_API_FN(CreateRenderTarget)(const rt_render_target_info *);
 extern void RT_RENDERER_API_FN(DestroyRenderTarget)(rt_render_target_handle);
-extern rt_result
-    RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t count,
-                                            rt_render_command_buffer_handle *p_command_buffers,
-                                            rt_gpu_semaphore_handle *p_semaphores);
-extern rt_result RT_RENDERER_API_FN(
-    SubmitCommandBuffers)(uint32_t count, const rt_render_command_buffer_handle *command_buffers);
+extern rt_result RT_RENDERER_API_FN(AllocCommandBuffers)(uint32_t,
+                                                         const rt_alloc_command_buffer_info *,
+                                                         rt_command_buffer_handle *);
+extern rt_result RT_RENDERER_API_FN(SubmitCommandBuffers)(rt_gpu_queue,
+                                                          const rt_submit_command_buffers_info *);
 #endif
 
 extern rt_result InitFramegraphManager(void);
@@ -62,6 +62,7 @@ static bool LoadRenderer(void) {
     RETRIEVE_SYMBOL(RegisterCVars, rt_register_renderer_cvars_fn);
     RETRIEVE_SYMBOL(Init, rt_init_renderer_fn);
     RETRIEVE_SYMBOL(Shutdown, rt_shutdown_renderer_fn);
+    RETRIEVE_SYMBOL(BeginFrame, rt_begin_frame_fn);
     RETRIEVE_SYMBOL(CompilePipeline, rt_compile_pipeline_fn);
     RETRIEVE_SYMBOL(DestroyPipeline, rt_destroy_pipeline_fn);
     RETRIEVE_SYMBOL(CreateRenderTarget, rt_create_render_target_fn);
@@ -80,6 +81,7 @@ static bool LoadRenderer(void) {
     g_renderer.RegisterCVars = &rtRenRegisterCVars;
     g_renderer.Init = &rtRenInit;
     g_renderer.Shutdown = &rtRenShutdown;
+    g_renderer.BeginFrame = &rtRenBeginFrame;
     g_renderer.CompilePipeline = &rtRenCompilePipeline;
     g_renderer.DestroyPipeline = &rtRenDestroyPipeline;
     g_renderer.CreateRenderTarget = &rtRenCreateRenderTarget;
@@ -121,3 +123,7 @@ RT_DLLEXPORT void rtShutdownGFX(void) {
     ShutdownFramegraphManager();
     g_renderer.Shutdown();
 }
+
+RT_DLLEXPORT void rtBeginGFXFrame(unsigned int frame_id) {
+    g_renderer.BeginFrame(frame_id);
+}
\ No newline at end of file
diff --git a/src/runtime/main_loop.c b/src/runtime/main_loop.c
index f6c46bb..0e576cd 100644
--- a/src/runtime/main_loop.c
+++ b/src/runtime/main_loop.c
@@ -2,6 +2,7 @@
 #include "main_loop.h"
 #include "runtime.h"
 #include "config.h"
+#include "gfx.h"
 
 RT_CVAR_I(rt_MaxFrameLatency, "Maximum latency between update and rendering. Default: 2", 2);
Default: 2", 2); @@ -16,11 +17,11 @@ void UpdateThreadEntry(void *param) { while (!g_main_loop.shutdown) { /* Wait until the render thread has catched up */ rtWaitOnSemaphore(&g_main_loop.update_proceed); - rtLog("UT", "Processing %d", g_main_loop.u_frame_id); + rtLog("UT", "Processing %u", g_main_loop.u_frame_id); (g_main_loop.GameUpdate)(); - rtLog("UT", "Finished %d", g_main_loop.u_frame_id); + rtLog("UT", "Finished %u", g_main_loop.u_frame_id); g_main_loop.u_frame_id += 1; /* Signal the render thread that data is available */ rtSignalSemaphore(&g_main_loop.render_proceed); @@ -35,11 +36,12 @@ void RenderThreadEntry(void *param) { rtLog("RT", "RenderThread Entry"); while (!g_main_loop.shutdown) { rtWaitOnSemaphore(&g_main_loop.render_proceed); - rtLog("RT", "Processing %d", g_main_loop.r_frame_id); + rtLog("RT", "Processing %u", g_main_loop.r_frame_id); + rtBeginGFXFrame(g_main_loop.r_frame_id); (g_main_loop.GameRender)(); - rtLog("RT", "Finished %d", g_main_loop.r_frame_id); + rtLog("RT", "Finished %u", g_main_loop.r_frame_id); g_main_loop.r_frame_id += 1; /* Signal the update thread that we have finished and it can proceed */ rtSignalSemaphore(&g_main_loop.update_proceed); diff --git a/src/runtime/main_loop.h b/src/runtime/main_loop.h index 8960344..8bb3771 100644 --- a/src/runtime/main_loop.h +++ b/src/runtime/main_loop.h @@ -8,8 +8,8 @@ typedef void rt_main_loop_update_fn(void); typedef void rt_main_loop_render_fn(void); typedef struct { - int u_frame_id; - int r_frame_id; + unsigned int u_frame_id; + unsigned int r_frame_id; rt_semaphore update_proceed; rt_semaphore render_proceed; diff --git a/src/runtime/meson.build b/src/runtime/meson.build index 8f4d0aa..0896abe 100644 --- a/src/runtime/meson.build +++ b/src/runtime/meson.build @@ -5,6 +5,7 @@ runtime_lib = library('rt', # Project Sources 'aio.h', 'app.h', + 'atomics.h', 'buffer_manager.h', 'compression.h', 'config.h', diff --git a/src/runtime/renderer_api.h b/src/runtime/renderer_api.h index 8604a0d..e8655fa 100644 --- a/src/runtime/renderer_api.h +++ b/src/runtime/renderer_api.h @@ -13,6 +13,25 @@ extern "C" { #endif +/* Handles for backend objects */ + +#define RT_RENDER_BACKEND_HANDLE_MAX_VERSION 255 + +#define RT_RENDER_BACKEND_HANDLE(name) \ + typedef struct { \ + uint32_t version : 8; \ + uint32_t index : 24; \ + } name + +RT_RENDER_BACKEND_HANDLE(rt_pipeline_handle); +RT_RENDER_BACKEND_HANDLE(rt_render_target_handle); +RT_RENDER_BACKEND_HANDLE(rt_command_buffer_handle); +RT_RENDER_BACKEND_HANDLE(rt_gpu_semaphore_handle); + +#undef RT_RENDER_BACKEND_HANDLE + +/* Init data for the renderer */ + #ifdef _WIN32 struct HINSTANCE__; struct HWND__; @@ -30,6 +49,14 @@ struct rt_renderer_init_info_s { #endif }; +/* Argument types for render commands */ + +typedef enum { + RT_GRAPHICS_QUEUE, + RT_COMPUTE_QUEUE, + RT_TRANSFER_QUEUE, +} rt_gpu_queue; + typedef struct { rt_resource_id vertex_shader; rt_resource_id fragment_shader; @@ -86,41 +113,39 @@ typedef struct { size_t bytecode_length; } rt_shader_info; -/* Handles for backend objects */ +typedef struct { + rt_gpu_queue target_queue; +} rt_alloc_command_buffer_info; -#define RT_RENDER_BACKEND_HANDLE_MAX_VERSION 255 +typedef struct { + const rt_command_buffer_handle *command_buffers; + const rt_gpu_semaphore_handle *wait_semaphores; + const rt_gpu_semaphore_handle *signal_semaphores; + uint32_t command_buffer_count; + uint32_t wait_semaphore_count; + uint32_t signal_semaphore_count; +} rt_submit_command_buffers_info; -#define RT_RENDER_BACKEND_HANDLE(name) \ - typedef struct { \ 
- uint32_t version : 8; \ - uint32_t index : 24; \ - } name - -RT_RENDER_BACKEND_HANDLE(rt_pipeline_handle); -RT_RENDER_BACKEND_HANDLE(rt_render_target_handle); -RT_RENDER_BACKEND_HANDLE(rt_render_command_buffer_handle); -RT_RENDER_BACKEND_HANDLE(rt_gpu_semaphore_handle); - -#undef RT_RENDER_BACKEND_HANDLE +/* Renderer API */ typedef void rt_register_renderer_cvars_fn(void); typedef rt_result rt_init_renderer_fn(const rt_renderer_init_info *info); typedef void rt_shutdown_renderer_fn(void); +typedef void rt_begin_frame_fn(unsigned int frame_id); typedef rt_pipeline_handle rt_compile_pipeline_fn(const rt_pipeline_info *info); typedef void rt_destroy_pipeline_fn(rt_pipeline_handle handle); typedef rt_render_target_handle rt_create_render_target_fn(const rt_render_target_info *info); typedef void rt_destroy_render_target_fn(rt_render_target_handle handle); typedef rt_result rt_alloc_command_buffers_fn(uint32_t count, - rt_render_command_buffer_handle *p_command_buffers, - rt_gpu_semaphore_handle *p_semaphores); -typedef rt_result -rt_submit_command_buffers_fn(uint32_t count, - const rt_render_command_buffer_handle *command_buffers); + const rt_alloc_command_buffer_info *info, + rt_command_buffer_handle *p_command_buffers); +typedef rt_result rt_submit_command_buffers_fn(rt_gpu_queue queue, const rt_submit_command_buffers_info *info); typedef struct { rt_register_renderer_cvars_fn *RegisterCVars; rt_init_renderer_fn *Init; rt_shutdown_renderer_fn *Shutdown; + rt_begin_frame_fn *BeginFrame; rt_compile_pipeline_fn *CompilePipeline; rt_destroy_pipeline_fn *DestroyPipeline; rt_create_render_target_fn *CreateRenderTarget; diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index 7a7f2b0..23a4c58 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -28,6 +28,8 @@ extern "C" { #define RT_UNUSED(x) ((void)sizeof((x))) #define RT_ARRAY_COUNT(x) (sizeof((x)) / sizeof((x)[0])) +#define RT_RESTRICT_VALUE_TO_BOUNDS(v, lower, upper) (((v) < (lower)) ? (lower) : (((v) > (upper)) ? (upper) : (v))) + #define RT_KB(n) ((n)*1024U) #define RT_MB(n) ((n)*1024U * 1024U) #define RT_GB(n) ((n)*1024U * 1024U * 1024U)
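
Usage sketch (illustrative, not part of the patch): the snippet below shows how
renderer-side code might drive the new interface. It assumes the rtRen* symbol
names produced by RT_RENDERER_API_FN, as the assignments in gfx_main.c suggest
(g_renderer.BeginFrame = &rtRenBeginFrame), and abbreviates error handling.
Real callers would go through the gfx layer's dispatch table rather than
calling the backend directly.

    #include "runtime/renderer_api.h"

    /* Declared by the backend; names follow the rtRen prefix convention. */
    extern rt_result rtRenAllocCommandBuffers(uint32_t count,
                                              const rt_alloc_command_buffer_info *info,
                                              rt_command_buffer_handle *p_command_buffers);
    extern rt_result rtRenSubmitCommandBuffers(rt_gpu_queue queue,
                                               const rt_submit_command_buffers_info *info);

    /* Allocate two command buffers targeting the graphics queue and submit
     * them together in one batch. */
    static rt_result RecordAndSubmitExample(void) {
        rt_alloc_command_buffer_info alloc_info[2] = {
            {.target_queue = RT_GRAPHICS_QUEUE},
            {.target_queue = RT_GRAPHICS_QUEUE},
        };
        rt_command_buffer_handle cmdbufs[2];
        rt_result res = rtRenAllocCommandBuffers(2, alloc_info, cmdbufs);
        if (res != RT_SUCCESS)
            return res;

        /* ... record commands into the buffers here ... */

        rt_submit_command_buffers_info submit_info = {
            .command_buffers = cmdbufs,
            .command_buffer_count = 2,
            /* Semaphores are not wired up yet; see the TODO in
             * SubmitCommandBuffers. */
            .wait_semaphores = NULL,
            .wait_semaphore_count = 0,
            .signal_semaphores = NULL,
            .signal_semaphore_count = 0,
        };
        return rtRenSubmitCommandBuffers(RT_GRAPHICS_QUEUE, &submit_info);
    }

Submitting a buffer to a queue other than its target_queue fails with
RT_INVALID_VALUE, as does reusing a stale handle whose ring buffer slot has
been recycled.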