#include "gpu_sprite_batch.hpp"

#include <SDL3/SDL_log.h>
#include <cstring>  // memcpy

// ---------------------------------------------------------------------------
// Public interface
// ---------------------------------------------------------------------------

// Creates the GPU vertex/index buffers and their upload transfer buffers, each
// sized for MAX_SPRITES quads (4 vertices and 6 indices per quad), and reserves
// matching capacity in the CPU-side staging vectors.
//
// Every resource is first created into a local. The members are only assigned
// once all four creations succeeded; if any step fails, whatever this call had
// already created is released again, so a failed init() leaks nothing and
// leaves the members exactly as they were.
//
// NOTE(review): handles left over from an earlier successful init() are
// overwritten without being released — call destroy() before re-initialising.
//
// device: GPU device used to create the buffers; must not be null.
// Returns true on success, false (after logging) on a null device or when any
// creation fails.
bool GpuSpriteBatch::init(SDL_GPUDevice* device) {
    if (!device) {
        SDL_Log("GpuSpriteBatch: init called with a null device");
        return false;
    }

    // Pre-allocate GPU buffers large enough for MAX_SPRITES quads.
    const Uint32 max_verts   = static_cast<Uint32>(MAX_SPRITES) * 4;
    const Uint32 max_indices = static_cast<Uint32>(MAX_SPRITES) * 6;

    const Uint32 vb_size = max_verts   * sizeof(GpuVertex);
    const Uint32 ib_size = max_indices * sizeof(uint32_t);

    SDL_GPUBuffer*         new_vertex_buf      = nullptr;
    SDL_GPUBuffer*         new_index_buf       = nullptr;
    SDL_GPUTransferBuffer* new_vertex_transfer = nullptr;
    SDL_GPUTransferBuffer* new_index_transfer  = nullptr;

    // Releases whatever this call has created so far; used on every failure path.
    const auto release_partial = [&]() {
        if (new_index_transfer)  { SDL_ReleaseGPUTransferBuffer(device, new_index_transfer); }
        if (new_vertex_transfer) { SDL_ReleaseGPUTransferBuffer(device, new_vertex_transfer); }
        if (new_index_buf)       { SDL_ReleaseGPUBuffer(device, new_index_buf); }
        if (new_vertex_buf)      { SDL_ReleaseGPUBuffer(device, new_vertex_buf); }
    };

    // Vertex buffer
    SDL_GPUBufferCreateInfo vb_info = {};
    vb_info.usage = SDL_GPU_BUFFERUSAGE_VERTEX;
    vb_info.size  = vb_size;
    new_vertex_buf = SDL_CreateGPUBuffer(device, &vb_info);
    if (!new_vertex_buf) {
        SDL_Log("GpuSpriteBatch: vertex buffer creation failed: %s", SDL_GetError());
        release_partial();
        return false;
    }

    // Index buffer
    SDL_GPUBufferCreateInfo ib_info = {};
    ib_info.usage = SDL_GPU_BUFFERUSAGE_INDEX;
    ib_info.size  = ib_size;
    new_index_buf = SDL_CreateGPUBuffer(device, &ib_info);
    if (!new_index_buf) {
        SDL_Log("GpuSpriteBatch: index buffer creation failed: %s", SDL_GetError());
        release_partial();
        return false;
    }

    // Transfer buffers (reused every frame via cycle=true on upload)
    SDL_GPUTransferBufferCreateInfo tb_info = {};
    tb_info.usage = SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD;

    tb_info.size        = vb_size;
    new_vertex_transfer = SDL_CreateGPUTransferBuffer(device, &tb_info);
    if (!new_vertex_transfer) {
        SDL_Log("GpuSpriteBatch: vertex transfer buffer failed: %s", SDL_GetError());
        release_partial();
        return false;
    }

    tb_info.size       = ib_size;
    new_index_transfer = SDL_CreateGPUTransferBuffer(device, &tb_info);
    if (!new_index_transfer) {
        SDL_Log("GpuSpriteBatch: index transfer buffer failed: %s", SDL_GetError());
        release_partial();
        return false;
    }

    // Everything exists: commit the handles to the batch.
    vertex_buf_      = new_vertex_buf;
    index_buf_       = new_index_buf;
    vertex_transfer_ = new_vertex_transfer;
    index_transfer_  = new_index_transfer;

    vertices_.reserve(MAX_SPRITES * 4);
    indices_.reserve(MAX_SPRITES * 6);
    return true;
}

// Releases every GPU resource the batch currently holds and nulls the handles,
// so a second call finds nothing left to release. Does nothing when `device`
// is null.
void GpuSpriteBatch::destroy(SDL_GPUDevice* device) {
    if (device == nullptr) {
        return;
    }

    // Upload (transfer) buffers first...
    if (vertex_transfer_ != nullptr) {
        SDL_ReleaseGPUTransferBuffer(device, vertex_transfer_);
        vertex_transfer_ = nullptr;
    }
    if (index_transfer_ != nullptr) {
        SDL_ReleaseGPUTransferBuffer(device, index_transfer_);
        index_transfer_ = nullptr;
    }

    // ...then the vertex and index buffers themselves.
    if (vertex_buf_ != nullptr) {
        SDL_ReleaseGPUBuffer(device, vertex_buf_);
        vertex_buf_ = nullptr;
    }
    if (index_buf_ != nullptr) {
        SDL_ReleaseGPUBuffer(device, index_buf_);
        index_buf_ = nullptr;
    }
}

// Starts a new frame: discards all staged geometry and zeroes the index
// offset/count bookkeeping for the background, sprite and overlay ranges.
void GpuSpriteBatch::beginFrame() {
    // Staged CPU-side geometry.
    indices_.clear();
    vertices_.clear();

    // Where each range starts in the index list...
    sprite_index_offset_  = 0;
    overlay_index_offset_ = 0;

    // ...and how many indices each range spans.
    bg_index_count_      = 0;
    sprite_index_count_  = 0;
    overlay_index_count_ = 0;
}

// Stages a full-screen quad with a vertical colour gradient: the two top
// corners take the (top_r, top_g, top_b) colour, the two bottom corners the
// (bot_r, bot_g, bot_b) colour, all with alpha 1.
//
// screen_w / screen_h are accepted but not used, because the quad is emitted
// directly in NDC. The background and sprite bookkeeping is set to fixed
// values (6 indices, sprites starting at index 6).
void GpuSpriteBatch::addBackground(float screen_w, float screen_h,
                                    float top_r, float top_g, float top_b,
                                    float bot_r, float bot_g, float bot_b) {
    // Unused — the background always covers the full NDC square.
    (void)screen_w;
    (void)screen_h;

    // Edges of the NDC square.
    constexpr float kLeft   = -1.0f;
    constexpr float kRight  =  1.0f;
    constexpr float kTop    =  1.0f;
    constexpr float kBottom = -1.0f;

    // Corner order relative to `base`: 0 = TL, 1 = TR, 2 = BR, 3 = BL.
    const uint32_t base = static_cast<uint32_t>(vertices_.size());

    // Top row carries the top colour.
    vertices_.push_back({ kLeft,  kTop, 0.0f, 0.0f, top_r, top_g, top_b, 1.0f });
    vertices_.push_back({ kRight, kTop, 1.0f, 0.0f, top_r, top_g, top_b, 1.0f });
    // Bottom row carries the bottom colour.
    vertices_.push_back({ kRight, kBottom, 1.0f, 1.0f, bot_r, bot_g, bot_b, 1.0f });
    vertices_.push_back({ kLeft,  kBottom, 0.0f, 1.0f, bot_r, bot_g, bot_b, 1.0f });

    // Two triangles: TL-TR-BR, then BR-BL-TL.
    static constexpr uint32_t kCornerOrder[6] = { 0, 1, 2, 2, 3, 0 };
    for (const uint32_t corner : kCornerOrder) {
        indices_.push_back(base + corner);
    }

    bg_index_count_      = 6;
    sprite_index_offset_ = 6;
}

// Stages one solid-colour sprite quad.
//
// (x, y, w, h) is the sprite rectangle in pixels. `scale` resizes it about its
// centre, so the rectangle's midpoint stays put. The scaled rectangle is
// converted to NDC using screen_w / screen_h and pushed with UVs spanning
// (0,0)-(1,1) and colour (r, g, b, a). The sprite index count grows by 6.
void GpuSpriteBatch::addSprite(float x, float y, float w, float h,
                                float r, float g, float b, float a,
                                float scale,
                                float screen_w, float screen_h) {
    // Size after scaling.
    const float draw_w = w * scale;
    const float draw_h = h * scale;

    // Half of the size change on each axis; shifting the origin by this much
    // keeps the scaled rectangle centred on the unscaled one.
    const float inset_x = (w - draw_w) * 0.5f;
    const float inset_y = (h - draw_h) * 0.5f;

    // Pixel-space corners of the scaled rectangle.
    const float left   = x + inset_x;
    const float top    = y + inset_y;
    const float right  = left + draw_w;
    const float bottom = top + draw_h;

    // Same corners in NDC.
    float ndc_left   = 0.0f;
    float ndc_top    = 0.0f;
    float ndc_right  = 0.0f;
    float ndc_bottom = 0.0f;
    toNDC(left,  top,    screen_w, screen_h, ndc_left,  ndc_top);
    toNDC(right, bottom, screen_w, screen_h, ndc_right, ndc_bottom);

    pushQuad(ndc_left, ndc_top, ndc_right, ndc_bottom,
             0.0f, 0.0f, 1.0f, 1.0f,
             r, g, b, a);
    sprite_index_count_ += 6;
}

// Stages an opaque white quad covering the whole NDC square and records where
// its 6 indices start in the index list.
void GpuSpriteBatch::addFullscreenOverlay() {
    // Captured before the quad is pushed: this is the overlay's first index.
    const int first_index = static_cast<int>(indices_.size());

    pushQuad(-1.0f,  1.0f,              // top-left corner (NDC)
              1.0f, -1.0f,              // bottom-right corner (NDC)
              0.0f,  0.0f, 1.0f, 1.0f,  // UV rectangle
              1.0f,  1.0f, 1.0f, 1.0f); // RGBA

    overlay_index_offset_ = first_index;
    overlay_index_count_  = 6;
}

// Copies this frame's staged vertices and indices into the GPU buffers.
//
// The staged data is written into the mapped upload transfer buffers, then a
// copy pass recorded on `cmd_buf` uploads it to offset 0 of the vertex and
// index buffers. Both the maps and the uploads pass cycle=true.
//
// Returns false when nothing is staged, when an argument or one of the
// buffers is null, when the staged geometry exceeds the MAX_SPRITES-quad
// capacity the buffers were created with in init(), or when an SDL call
// fails. Returns true once the copy pass has been recorded.
bool GpuSpriteBatch::uploadBatch(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd_buf) {
    if (vertices_.empty()) return false;

    if (!device || !cmd_buf ||
        !vertex_buf_ || !index_buf_ || !vertex_transfer_ || !index_transfer_) {
        SDL_Log("GpuSpriteBatch: uploadBatch called with a null device, command buffer or buffer");
        return false;
    }

    // The transfer and GPU buffers hold at most MAX_SPRITES quads. Copying
    // more than that would write past the end of the mapped transfer memory,
    // so an oversized batch is rejected instead of uploaded.
    const std::size_t max_verts   = static_cast<std::size_t>(MAX_SPRITES) * 4;
    const std::size_t max_indices = static_cast<std::size_t>(MAX_SPRITES) * 6;
    if (vertices_.size() > max_verts || indices_.size() > max_indices) {
        SDL_Log("GpuSpriteBatch: batch too large (%u vertices, %u indices); upload skipped",
                static_cast<unsigned>(vertices_.size()),
                static_cast<unsigned>(indices_.size()));
        return false;
    }

    const Uint32 vb_size = static_cast<Uint32>(vertices_.size() * sizeof(GpuVertex));
    const Uint32 ib_size = static_cast<Uint32>(indices_.size()  * sizeof(uint32_t));

    // Map → write → unmap transfer buffers
    void* vp = SDL_MapGPUTransferBuffer(device, vertex_transfer_, true /* cycle */);
    if (!vp) {
        SDL_Log("GpuSpriteBatch: vertex map failed: %s", SDL_GetError());
        return false;
    }
    std::memcpy(vp, vertices_.data(), vb_size);
    SDL_UnmapGPUTransferBuffer(device, vertex_transfer_);

    void* ip = SDL_MapGPUTransferBuffer(device, index_transfer_, true /* cycle */);
    if (!ip) {
        SDL_Log("GpuSpriteBatch: index map failed: %s", SDL_GetError());
        return false;
    }
    std::memcpy(ip, indices_.data(), ib_size);
    SDL_UnmapGPUTransferBuffer(device, index_transfer_);

    // Upload via copy pass
    SDL_GPUCopyPass* copy = SDL_BeginGPUCopyPass(cmd_buf);
    if (!copy) {
        SDL_Log("GpuSpriteBatch: copy pass failed: %s", SDL_GetError());
        return false;
    }

    SDL_GPUTransferBufferLocation v_src = { vertex_transfer_, 0 };
    SDL_GPUBufferRegion           v_dst = { vertex_buf_,      0, vb_size };
    SDL_UploadToGPUBuffer(copy, &v_src, &v_dst, true /* cycle */);

    SDL_GPUTransferBufferLocation i_src = { index_transfer_, 0 };
    SDL_GPUBufferRegion           i_dst = { index_buf_,      0, ib_size };
    SDL_UploadToGPUBuffer(copy, &i_src, &i_dst, true /* cycle */);

    SDL_EndGPUCopyPass(copy);
    return true;
}

// ---------------------------------------------------------------------------
// Private helpers
// ---------------------------------------------------------------------------

// Converts a pixel position to normalised device coordinates.
//
// x maps 0 → -1 and screen_w → +1. y is flipped: 0 → +1 and screen_h → -1.
// The results are written to `ndx` and `ndy`.
void GpuSpriteBatch::toNDC(float px, float py,
                             float screen_w, float screen_h,
                             float& ndx, float& ndy) const {
    // Position as a fraction of the screen size on each axis.
    const float frac_x = px / screen_w;
    const float frac_y = py / screen_h;

    ndx = frac_x * 2.0f - 1.0f;
    ndy = 1.0f - frac_y * 2.0f;
}

// Appends one single-colour quad to the staged geometry.
//
// (ndx0, ndy0) and (ndx1, ndy1) are opposite corners, paired with the UVs
// (u0, v0) and (u1, v1). Four vertices are added in the order
// (x0,y0), (x1,y0), (x1,y1), (x0,y1), followed by six indices forming the
// triangles 0-1-2 and 2-3-0 relative to the first new vertex.
void GpuSpriteBatch::pushQuad(float ndx0, float ndy0, float ndx1, float ndy1,
                               float u0, float v0, float u1, float v1,
                               float r, float g, float b, float a) {
    // Index of the first vertex this quad adds.
    const uint32_t first = static_cast<uint32_t>(vertices_.size());

    // Corners in order: TL, TR, BR, BL.
    vertices_.push_back({ ndx0, ndy0, u0, v0, r, g, b, a });
    vertices_.push_back({ ndx1, ndy0, u1, v0, r, g, b, a });
    vertices_.push_back({ ndx1, ndy1, u1, v1, r, g, b, a });
    vertices_.push_back({ ndx0, ndy1, u0, v1, r, g, b, a });

    static constexpr uint32_t kCornerOrder[6] = { 0, 1, 2, 2, 3, 0 };
    for (const uint32_t corner : kCornerOrder) {
        indices_.push_back(first + corner);
    }
}