#include "config.h"
#include "gfx.h"
#include "handles.h"
#include "hashing.h"
#include "mem_arena.h"
#include "renderer_api.h"
#include "threading.h"

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

RT_CVAR_I(rt_MaxFramegraphs, "Maximum number of framegraphs. Default 16", 16);

#define RT_FRAMEGRAPH_MAX_PASSES 32
#define RT_FRAMEGRAPH_MAX_RENDER_TARGETS 32
#define RT_RENDERPASS_MAX_READS 8
#define RT_RENDERPASS_MAX_WRITES 8

/* A render target owned by a framegraph, backed by a renderer API handle. */
typedef struct {
    rt_render_target_id id;
    rt_pixel_format format;
    unsigned int width;
    unsigned int height;
    unsigned int sample_count;
    rt_render_target_handle api_render_target;
} rt_render_target;

/* A scheduled render pass. Passes that share an execution_level have no
 * dependencies on each other and may be dispatched concurrently. */
typedef struct {
    rt_render_pass_id id;
    int execution_level;
    unsigned int read_count;
    unsigned int write_count;
    rt_render_pass_bind_fns bound_fns;
    rt_render_target_read reads[RT_RENDERPASS_MAX_READS];
    rt_render_target_write writes[RT_RENDERPASS_MAX_WRITES];
} rt_render_pass;

struct rt_framegraph_s {
    uint32_t pass_count;
    uint32_t render_target_count;
    rt_framegraph *next_free; /* intrusive free-list link */
    rt_render_pass passes[RT_FRAMEGRAPH_MAX_PASSES];
    rt_render_target render_targets[RT_FRAMEGRAPH_MAX_RENDER_TARGETS];
};

/* Fixed pool of framegraphs; unused entries are chained through next_free.
 * All free-list mutation is guarded by _free_list_lock. */
static rt_framegraph *_framegraphs;
static rt_framegraph *_first_free;
static rt_mutex *_free_list_lock;

/* Pushes a framegraph back onto the shared free list. */
static void ReturnFrameGraph(rt_framegraph *framegraph) {
    rtLockMutex(_free_list_lock);
    framegraph->next_free = _first_free;
    _first_free = framegraph;
    rtUnlockMutex(_free_list_lock);
}

/* Allocates the framegraph pool and builds the initial free list.
 * Returns RT_SUCCESS, RT_UNKNOWN_ERROR (mutex creation failed) or
 * RT_OUT_OF_MEMORY. */
rt_result InitFramegraphManager(void) {
    _free_list_lock = rtCreateMutex();
    if (!_free_list_lock)
        return RT_UNKNOWN_ERROR;
    _framegraphs = calloc((size_t)rt_MaxFramegraphs.i, sizeof(rt_framegraph));
    if (!_framegraphs) {
        /* FIX: do not leak the mutex on the OOM path. */
        rtDestroyMutex(_free_list_lock);
        _free_list_lock = NULL;
        return RT_OUT_OF_MEMORY;
    }
    /* Chain every pool entry into the free list; the last one terminates it. */
    for (int i = 0; i < rt_MaxFramegraphs.i; ++i)
        _framegraphs[i].next_free =
            (i < rt_MaxFramegraphs.i - 1) ? &_framegraphs[i + 1] : NULL;
    _first_free = &_framegraphs[0];
    return RT_SUCCESS;
}

void ShutdownFramegraphManager(void) {
    free(_framegraphs);
    rtDestroyMutex(_free_list_lock);
    /* FIX: clear the globals so accidental use after shutdown fails loudly
     * instead of dereferencing dangling pointers. */
    _framegraphs = NULL;
    _first_free = NULL;
    _free_list_lock = NULL;
}

/* Temporary per-pass bookkeeping used while computing the schedule. */
typedef struct {
    unsigned int dependency_count; /* number of unscheduled passes this one waits on */
    int execution_level;           /* -1 while unscheduled */
} rt_pass_construct;

/* qsort comparator ordering passes by ascending execution level. */
static int CompareRenderPassExecutionLevels(const void *a, const void *b) {
    const rt_render_pass *pass_a = a, *pass_b = b;
    /* Three-way compare instead of subtraction: cannot overflow. */
    return (pass_a->execution_level > pass_b->execution_level) -
           (pass_a->execution_level < pass_b->execution_level);
}

/* Builds the dependency matrix between passes, assigns each pass an execution
 * level (level N runs only after all levels < N) and copies the pass data into
 * the graph, sorted by level. Temporary storage comes from `arena`.
 * Returns false on OOM or if no valid schedule exists (dependency cycle). */
static bool
CreateRenderPasses(rt_framegraph *graph, const rt_framegraph_info *info, rt_arena *arena) {
    uint32_t render_pass_count = info->render_pass_count;
    bool result = false;

    /* Pass A depends on pass B, if:
     *   B preceeds A in the list of render passes AND
     *   B writes to a render target that A reads from. */
    bool *dependency_matrix =
        rtArenaPushZero(arena, render_pass_count * render_pass_count * sizeof(bool));
    if (!dependency_matrix) {
        rtLog("GFX",
              "Not enough memory to allocate a %ux%u dependency matrix.",
              render_pass_count,
              render_pass_count);
        goto out;
    }

/* Checks if pass "dependent_idx" depends on pass "dependency_idx" */
#define PASS_DEPENDS(dependent_idx, dependency_idx)                                                \
    dependency_matrix[(dependency_idx)*render_pass_count + (dependent_idx)]

    rt_pass_construct *construct_passes =
        RT_ARENA_PUSH_ARRAY_ZERO(arena, rt_pass_construct, render_pass_count);
    if (!construct_passes) {
        rtLog("GFX",
              "Not enough memory to allocate construction information for %u passes.",
              render_pass_count);
        goto out;
    }

    const rt_render_pass_info *pass_info = rtResolveConstRelptr(&info->render_passes);
    for (uint32_t i = 0; i < render_pass_count; ++i) {
        construct_passes[i].execution_level = -1; /* not scheduled yet */

        const rt_render_target_write *writes_i =
            rtResolveConstRelptr(&pass_info[i].write_render_targets);
        for (uint32_t j = i + 1; j < render_pass_count; ++j) {
            const rt_render_target_read *reads_j =
                rtResolveConstRelptr(&pass_info[j].read_render_targets);
            bool depends = false;
            for (uint32_t read_idx = 0; read_idx < pass_info[j].read_render_target_count;
                 ++read_idx) {
                for (uint32_t write_idx = 0; write_idx < pass_info[i].write_render_target_count;
                     ++write_idx) {
                    if (writes_i[write_idx].render_target == reads_j[read_idx].render_target)
                        depends = true;
                }
            }
            PASS_DEPENDS(j, i) = depends;
            if (depends)
                ++construct_passes[j].dependency_count;
        }
    }

    /* Pass A can be executed concurrently with pass B if:
     *  1. A and B don't write to the same render target AND
     *  2. A's dependencies and B's dependencies have finished executing. */

    /* We can have at most render_pass_count execution levels */
    uint32_t *level_passes = RT_ARENA_PUSH_ARRAY_ZERO(arena, uint32_t, render_pass_count);
    if (!level_passes) {
        rtLog("GFX", "Failed to allocate a temporary array for constructing execution levels.");
        goto out;
    }

    uint32_t unscheduled_passes = render_pass_count;
    for (int level = 0; level < (int)render_pass_count; ++level) {
        unsigned int level_pass_count = 0;
        for (uint32_t i = 0; i < render_pass_count; ++i) {
            if (construct_passes[i].execution_level == -1 &&
                construct_passes[i].dependency_count == 0) {
                /* Check that no writes conflict with passes already in this level. */
                bool write_conflict = false;
                const rt_render_target_write *writes_i =
                    rtResolveConstRelptr(&pass_info[i].write_render_targets);
                for (unsigned int j = 0; j < level_pass_count; ++j) {
                    /* FIX: compare against the j-th pass already scheduled into
                     * this level. The original read level_passes[i], indexing
                     * past the initialized range of the array. */
                    uint32_t pass_idx = level_passes[j];
                    const rt_render_target_write *pass_writes =
                        rtResolveConstRelptr(&pass_info[pass_idx].write_render_targets);
                    for (uint32_t k = 0; k < pass_info[i].write_render_target_count; ++k) {
                        for (uint32_t l = 0; l < pass_info[pass_idx].write_render_target_count;
                             ++l) {
                            if (writes_i[k].render_target == pass_writes[l].render_target) {
                                write_conflict = true;
                                break;
                            }
                        }
                        if (write_conflict)
                            break;
                    }
                    if (write_conflict)
                        break;
                }
                if (!write_conflict) {
                    RT_ASSERT(level_pass_count < render_pass_count, "");
                    level_passes[level_pass_count++] = i;
                    construct_passes[i].execution_level = level;
                }
            }
        }

        if (level_pass_count == 0) {
            /* No schedulable pass left => dependency cycle in the graph. */
            rtLog("GFX", "Failed to compute a valid schedule for the provided framegraph.");
            goto out;
        }

        /* level_passes now contains the passes we can execute concurrently.
         * Decrement dependency count for all passes that depend on a pass in
         * this level. */
        for (uint32_t i = 0; i < level_pass_count; ++i) {
            for (uint32_t j = 0; j < render_pass_count; ++j) {
                if (PASS_DEPENDS(j, level_passes[i]))
                    --construct_passes[j].dependency_count;
            }
        }

        unscheduled_passes -= level_pass_count;
        if (unscheduled_passes == 0)
            break;
    }
    RT_ASSERT(unscheduled_passes == 0, "Did not schedule all passes");

    /* construct_passes now contains the "execution level" for each pass.
     * We execute passes in that order; those with the same execution level can
     * be executed concurrently. */
    graph->pass_count = render_pass_count;
    for (uint32_t i = 0; i < render_pass_count; ++i) {
        graph->passes[i].execution_level = construct_passes[i].execution_level;

        const rt_render_target_write *writes =
            rtResolveConstRelptr(&pass_info[i].write_render_targets);
        const rt_render_target_read *reads =
            rtResolveConstRelptr(&pass_info[i].read_render_targets);
        memcpy(graph->passes[i].writes,
               writes,
               pass_info[i].write_render_target_count * sizeof(rt_render_target_write));
        memcpy(graph->passes[i].reads,
               reads,
               pass_info[i].read_render_target_count * sizeof(rt_render_target_read));
        graph->passes[i].write_count = pass_info[i].write_render_target_count;
        graph->passes[i].read_count = pass_info[i].read_render_target_count;
        graph->passes[i].id = pass_info[i].id;
    }

    /* Sort by execution level */
    qsort(graph->passes,
          render_pass_count,
          sizeof(rt_render_pass),
          CompareRenderPassExecutionLevels);

    result = true;
out:
    return result;
#undef PASS_DEPENDS
}

/* Copies the render target descriptions into the graph and creates the backend
 * render targets. On failure, destroys any targets created so far and returns
 * false. */
static bool
CreateRenderTargets(rt_framegraph *graph, const rt_framegraph_info *info, rt_arena *arena) {
    bool result = false;
    /* TODO(Kevin): determine aliasing opportunities */
    const rt_render_target_info *render_targets = rtResolveConstRelptr(&info->render_targets);
    for (uint32_t i = 0; i < info->render_target_count; ++i) {
        graph->render_targets[i].id = render_targets[i].id;
        graph->render_targets[i].format = render_targets[i].format;
        graph->render_targets[i].width = render_targets[i].width;
        graph->render_targets[i].height = render_targets[i].height;
        graph->render_targets[i].sample_count = render_targets[i].sample_count;
        graph->render_targets[i].api_render_target =
            g_renderer.CreateRenderTarget(&render_targets[i]);
        if (!RT_IS_HANDLE_VALID(graph->render_targets[i].api_render_target)) {
            rtReportError("GFX", "Failed to create render target %u of framegraph.", i);
            for (uint32_t j = 0; j < i; ++j)
                g_renderer.DestroyRenderTarget(graph->render_targets[j].api_render_target);
            goto out;
        }
    }
    /* FIX: record the count; it was never set, leaving it at 0 after the
     * graph-acquire memset. */
    graph->render_target_count = info->render_target_count;
    result = true;
out:
    return result;
}

/* Validates counts, ids, formats and read/write limits of a framegraph
 * description. Reports an error and returns false on the first violation. */
static bool ValidateInfo(const rt_framegraph_info *info) {
    if (info->render_pass_count > RT_FRAMEGRAPH_MAX_PASSES) {
        rtReportError("GFX",
                      "Framegraph has too many passes: %u (maximum allowed is %u)",
                      info->render_pass_count,
                      RT_FRAMEGRAPH_MAX_PASSES);
        return false;
    }
    if (info->render_target_count > RT_FRAMEGRAPH_MAX_RENDER_TARGETS) {
        rtReportError("GFX",
                      "Framegraph has too many render targets: %u (maximum allowed is %u)",
                      info->render_target_count,
                      RT_FRAMEGRAPH_MAX_RENDER_TARGETS);
        return false;
    }

    const rt_render_target_info *render_targets = rtResolveConstRelptr(&info->render_targets);
    for (uint32_t i = 0; i < info->render_target_count; ++i) {
        if (render_targets[i].id == 0) {
            rtReportError("GFX", "Framegraph render target %u has invalid id 0", i);
            return false;
        } else if ((render_targets[i].width == RT_RENDER_TARGET_SIZE_SWAPCHAIN ||
                    render_targets[i].height == RT_RENDER_TARGET_SIZE_SWAPCHAIN) &&
                   (render_targets[i].width != render_targets[i].height)) {
            rtReportError("GFX",
                          "Framegraph render target %u: If width or height is set to "
                          "SWAPCHAIN, both values must be set to SWAPCHAIN.",
                          i);
            return false;
        } else if (render_targets[i].format >= RT_PIXEL_FORMAT_count) {
            rtReportError("GFX",
                          "Framegraph render target %u format is outside the allowed range.",
                          i);
            return false;
        }
    }

    const rt_render_pass_info *passes = rtResolveConstRelptr(&info->render_passes);
    for (uint32_t i = 0; i < info->render_pass_count; ++i) {
        if (passes[i].id == 0) {
            rtReportError("GFX", "Framegraph pass %u has invalid id 0", i);
            return false;
        } else if (passes[i].read_render_target_count > RT_RENDERPASS_MAX_READS) {
            rtReportError(
                "GFX",
                "Framegraph pass %u reads too many rendertargets: %u (maximum allowed is %u)",
                i,
                passes[i].read_render_target_count,
                RT_RENDERPASS_MAX_READS);
            return false;
        } else if (passes[i].write_render_target_count > RT_RENDERPASS_MAX_WRITES) {
            rtReportError(
                "GFX",
                "Framegraph pass %u writes too many rendertargets: %u (maximum allowed is %u)",
                i,
                passes[i].write_render_target_count,
                RT_RENDERPASS_MAX_WRITES);
            return false;
        }
    }
    return true;
}

/* Creates a framegraph from a validated description. Returns NULL if the
 * description is invalid, the pool is exhausted, or construction fails.
 * Ownership: destroy the result with rtDestroyFramegraph(). */
RT_DLLEXPORT rt_framegraph *rtCreateFramegraph(const rt_framegraph_info *info) {
    if (!ValidateInfo(info)) {
        return NULL;
    }

    rt_temp_arena temp = rtGetTemporaryArena(NULL, 0);
    if (!temp.arena) {
        rtReportError("GFX", "Failed to acquire a temporary arena for constructing a framegraph");
        return NULL;
    }

    rt_framegraph *graph = NULL;

    /* Acquire a unused framegraph */
    rtLockMutex(_free_list_lock);
    graph = _first_free;
    if (graph)
        _first_free = graph->next_free;
    rtUnlockMutex(_free_list_lock);

    if (!graph) {
        /* FIX: report pool exhaustion instead of failing silently. */
        rtLog("GFX", "Framegraph pool exhausted (rt_MaxFramegraphs = %d).", rt_MaxFramegraphs.i);
        goto out;
    }
    memset(graph, 0, sizeof(*graph));

    if (!CreateRenderPasses(graph, info, temp.arena)) {
        ReturnFrameGraph(graph);
        graph = NULL;
        goto out;
    }

    if (!CreateRenderTargets(graph, info, temp.arena)) {
        ReturnFrameGraph(graph);
        graph = NULL;
        goto out;
    }

out:
    rtReturnTemporaryArena(temp);
    return graph;
}

/* Destroys a framegraph's backend render targets and returns the graph to the
 * pool. Safe to call with NULL. */
RT_DLLEXPORT void rtDestroyFramegraph(rt_framegraph *framegraph) {
    if (!framegraph)
        return;
    /* FIX: release the API render targets created in CreateRenderTargets();
     * previously they leaked when a framegraph was destroyed. */
    for (uint32_t i = 0; i < framegraph->render_target_count; ++i) {
        if (RT_IS_HANDLE_VALID(framegraph->render_targets[i].api_render_target))
            g_renderer.DestroyRenderTarget(framegraph->render_targets[i].api_render_target);
    }
    ReturnFrameGraph(framegraph);
}

/* Binds the Prepare/Execute/Finalize callbacks to the pass with the given id.
 * Logs (but allows) rebinding and logs unknown pass ids. */
RT_DLLEXPORT void rtBindRenderPass(rt_framegraph *framegraph,
                                   rt_render_pass_id id,
                                   const rt_render_pass_bind_fns *bind_fns) {
    for (uint32_t i = 0; i < framegraph->pass_count; ++i) {
        if (framegraph->passes[i].id == id) {
            if (framegraph->passes[i].bound_fns.Execute)
                rtLog("GFX", "Rebound pass %x to new functions", id);
            framegraph->passes[i].bound_fns = *bind_fns;
            return;
        }
    }
    rtLog("GFX", "Tried to bind functions to unknown render pass %x", id);
}

/* Runs all bound passes in execution-level order. Unbound passes are logged
 * and skipped. Passes are already sorted by execution level. */
RT_DLLEXPORT void rtExecuteFramegraph(rt_framegraph *framegraph) {
    if (framegraph->pass_count == 0)
        return;

    int execution_level = framegraph->passes[0].execution_level;
    uint32_t level_start = 0;

    for (uint32_t i = 0; i <= framegraph->pass_count && level_start < framegraph->pass_count;
         ++i) {
        /* The index one past the last pass closes the final level. */
        if ((i == framegraph->pass_count) ||
            (framegraph->passes[i].execution_level > execution_level)) {
            /* Dispatch all passes in the current execution level */
            for (uint32_t pass_idx = level_start; pass_idx < i; ++pass_idx) {
                bool pass_bound = framegraph->passes[pass_idx].bound_fns.Prepare != NULL &&
                                  framegraph->passes[pass_idx].bound_fns.Execute != NULL &&
                                  framegraph->passes[pass_idx].bound_fns.Finalize != NULL;
                if (!pass_bound) {
                    rtLog("GFX",
                          "Framegraph pass %u (%x) is not bound to any function.",
                          pass_idx,
                          framegraph->passes[pass_idx].id);
                    continue;
                }

                rt_render_pass_id id = framegraph->passes[pass_idx].id;
                const rt_render_target_write *writes = framegraph->passes[pass_idx].writes;
                const rt_render_target_read *reads = framegraph->passes[pass_idx].reads;
                uint32_t write_count = framegraph->passes[pass_idx].write_count;
                uint32_t read_count = framegraph->passes[pass_idx].read_count;

                /* TODO(Kevin): Every one of these should be a job-dispatch*/
                framegraph->passes[pass_idx]
                    .bound_fns.Prepare(id, writes, write_count, reads, read_count);
                framegraph->passes[pass_idx]
                    .bound_fns.Execute(id, writes, write_count, reads, read_count);
                framegraph->passes[pass_idx]
                    .bound_fns.Finalize(id, writes, write_count, reads, read_count);
            }

            /* Start next level */
            level_start = i;
            if (i < framegraph->pass_count)
                execution_level = framegraph->passes[i].execution_level;
        }
    }
}

/* Hashes a render target name to a non-zero id (0 is the invalid id). */
RT_DLLEXPORT rt_render_target_id rtCalculateRenderTargetID(const char *name, size_t len) {
    rt_render_target_id id = rtHashBytes32(name, len);
    if (id == 0)
        id = ~id;
    return id;
}

/* Hashes a render pass name to a non-zero id (0 is the invalid id). */
RT_DLLEXPORT rt_render_pass_id rtCalculateRenderPassID(const char *name, size_t len) {
    rt_render_pass_id id = rtHashBytes32(name, len);
    if (id == 0)
        id = ~id;
    return id;
}