#pragma once

#include <SDL3/SDL_gpu.h>
#include <cstdint>

// ---------------------------------------------------------------------------
// BallGPUData — 32-byte per-instance record stored in VRAM.
// Positions and sizes pre-converted to NDC space on CPU so the vertex shader
// needs no screen-dimension uniform.
//   cx, cy : NDC center   (cx = (x + w/2)/sw*2-1,  cy = 1-(y+h/2)/sh*2)
//   hw, hh : NDC half-size (hw = w/sw,  hh = h/sh, both positive)
//   r,g,b,a: RGBA in [0,1]
// ---------------------------------------------------------------------------
struct BallGPUData {
    // Quad center in NDC.
    float cx;
    float cy;

    // Half extents in NDC; expected to be positive.
    float hw;
    float hh;

    // Color channels, each in [0,1].
    float r;
    float g;
    float b;
    float a;
};
// Compile-time guard: the record must stay exactly 32 bytes (eight floats),
// so adding or reordering fields that introduce padding fails the build.
static_assert(sizeof(BallGPUData) == 32, "BallGPUData must be 32 bytes");

// ============================================================================
// GpuBallBuffer — owns the GPU vertex buffer used for instanced ball rendering.
//
// Usage per frame:
//   buffer.upload(device, cmd, data, count);  // records its own copy pass; call outside any other pass
//   // Then in render pass: bind buffer.buffer(), SDL_DrawGPUPrimitives(pass, 6, buffer.count(), 0, 0)
// ============================================================================
class GpuBallBuffer {
public:
    // Upper bound on the number of balls; upload() clamps its count to this.
    static constexpr int MAX_BALLS = 500000;

    // Prepares the buffer for use with `device`.
    // NOTE(review): defined outside this header — presumably allocates
    // gpu_buf_ / transfer_buf_ and returns true on success; confirm.
    bool init(SDL_GPUDevice* device);

    // Tears down what init() set up on `device`.
    // NOTE(review): defined outside this header — presumably releases both
    // buffers; confirm whether it is safe to call twice or without init().
    void destroy(SDL_GPUDevice* device);

    // Copies the ball records in `data` to the GPU using an internal copy
    // pass recorded on `cmd`.
    //   device : GPU device the buffers belong to
    //   cmd    : command buffer the transfer is recorded on
    //   data   : array of at least `count` BallGPUData records
    //   count  : number of records; clamped to MAX_BALLS
    // Returns false on error or when the input is empty.
    bool upload(SDL_GPUDevice*        device,
                SDL_GPUCommandBuffer* cmd,
                const BallGPUData*    data,
                int                   count);

    // GPU-side buffer handle; nullptr until something assigns gpu_buf_.
    SDL_GPUBuffer* buffer() const
    {
        return gpu_buf_;
    }

    // Current value of count_ (0 by default).
    // NOTE(review): presumably the clamped count from the last upload() — confirm.
    int count() const
    {
        return count_;
    }

private:
    SDL_GPUBuffer*         gpu_buf_{nullptr};       // handle returned by buffer()
    SDL_GPUTransferBuffer* transfer_buf_{nullptr};  // presumably the CPU->GPU staging buffer — confirm
    int                    count_{0};               // value returned by count()
};