refactor(gpu): eliminar GPU compute boids (prevé crash macOS)

Elimina el kernel Metal O(N²) de boids en GPU que causava GPU timeout a macOS amb >50K boles, arrossegant WindowServer fins al crash.

- Elimina gpu_boid_buffer.hpp/cpp (GpuBoidBuffer, BallComputeData, BoidParams)
- Elimina kBoidComputeMSL i kBallComputeVertMSL de gpu_pipeline
- Elimina boid_compute_pipeline_ i ball_compute_pipeline_
- Elimina use_gpu_boids_, boid_params_, ball_screen_uniforms_ de Engine
- Elimina syncAndExitGpuBoids() i tot el compute dispatch de render()
- Mode BOIDS ara usa sempre boid_manager_ (CPU, spatial hash O(N)) i renderitza via gpu_ball_buffer_ instanced (mateix path que PHYSICS)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 08:45:01 +01:00
parent badf92420b
commit d2e7f2ff86
8 changed files with 321 additions and 41 deletions
--- a/source/gpu/gpu_ball_buffer.cpp
+++ b/source/gpu/gpu_ball_buffer.cpp
@@ -0,0 +1,65 @@
+#include "gpu_ball_buffer.hpp"
+
+#include <SDL3/SDL_log.h>
+#include <algorithm>  // std::min
+#include <cstring>    // memcpy
+
+// Allocates the instance vertex buffer and its upload staging (transfer) buffer,
+// each sized for MAX_BALLS BallGPUData records.
+//   device : GPU device that will own both buffers; must not be null.
+// Returns true when both buffers exist afterwards. On failure the reason is
+// logged, nothing stays allocated, and false is returned.
+bool GpuBallBuffer::init(SDL_GPUDevice* device) {
+    if (!device) {
+        SDL_Log("GpuBallBuffer: init called with a null device");
+        return false;
+    }
+
+    // A repeated init() would otherwise overwrite (and leak) the live handles.
+    destroy(device);
+
+    const Uint32 buf_size = static_cast<Uint32>(MAX_BALLS * sizeof(BallGPUData));
+
+    // GPU vertex buffer (instance-rate data read by the ball instanced shader)
+    SDL_GPUBufferCreateInfo buf_info = {};
+    buf_info.usage = SDL_GPU_BUFFERUSAGE_VERTEX;
+    buf_info.size  = buf_size;
+    gpu_buf_ = SDL_CreateGPUBuffer(device, &buf_info);
+    if (!gpu_buf_) {
+        SDL_Log("GpuBallBuffer: GPU buffer creation failed: %s", SDL_GetError());
+        return false;
+    }
+
+    // Transfer buffer (upload staging, cycled every frame)
+    SDL_GPUTransferBufferCreateInfo tb_info = {};
+    tb_info.usage = SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD;
+    tb_info.size  = buf_size;
+    transfer_buf_ = SDL_CreateGPUTransferBuffer(device, &tb_info);
+    if (!transfer_buf_) {
+        SDL_Log("GpuBallBuffer: transfer buffer creation failed: %s", SDL_GetError());
+        // Do not keep a half-initialized object: release the vertex buffer
+        // created above so a failed init() leaks nothing.
+        destroy(device);
+        return false;
+    }
+
+    SDL_Log("GpuBallBuffer: initialized (capacity %d balls, %.1f MB VRAM)",
+            MAX_BALLS, buf_size / (1024.0f * 1024.0f));
+    return true;
+}
+
+// Releases the transfer buffer and the vertex buffer (whichever exist), clears
+// their handles and resets the stored instance count to 0.
+// Does nothing at all when `device` is null.
+void GpuBallBuffer::destroy(SDL_GPUDevice* device) {
+    if (device == nullptr) {
+        return;
+    }
+
+    if (transfer_buf_ != nullptr) {
+        SDL_ReleaseGPUTransferBuffer(device, transfer_buf_);
+        transfer_buf_ = nullptr;
+    }
+
+    if (gpu_buf_ != nullptr) {
+        SDL_ReleaseGPUBuffer(device, gpu_buf_);
+        gpu_buf_ = nullptr;
+    }
+
+    count_ = 0;
+}
+
+// Copies `count` BallGPUData records into the instance vertex buffer: the data
+// is staged in the transfer buffer, then a copy pass is begun and ended on `cmd`.
+//   device : GPU device that owns the buffers.
+//   cmd    : command buffer the copy pass is recorded into.
+//   data   : source records; must hold at least `count` entries.
+//   count  : number of records; values above MAX_BALLS are clamped.
+// Returns true when the copy was recorded, after which count() reports the
+// clamped count. Empty input resets count() to 0 and returns false. Any other
+// failure is logged, returns false and leaves count() at its previous value
+// (no new upload was recorded in that case).
+bool GpuBallBuffer::upload(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd,
+                            const BallGPUData* data, int count) {
+    if (!data || count <= 0) { count_ = 0; return false; }
+
+    // Guard against use before init() / after destroy() and against null
+    // handles, instead of forwarding them to SDL.
+    if (!device || !cmd || !transfer_buf_ || !gpu_buf_) {
+        SDL_Log("GpuBallBuffer: upload called without a device, command buffer or initialized buffers");
+        return false;
+    }
+
+    count = std::min(count, MAX_BALLS);
+    const Uint32 upload_size = static_cast<Uint32>(count * sizeof(BallGPUData));
+
+    void* ptr = SDL_MapGPUTransferBuffer(device, transfer_buf_, true /* cycle */);
+    if (!ptr) {
+        SDL_Log("GpuBallBuffer: transfer buffer map failed: %s", SDL_GetError());
+        return false;
+    }
+    std::memcpy(ptr, data, upload_size);
+    SDL_UnmapGPUTransferBuffer(device, transfer_buf_);
+
+    SDL_GPUCopyPass* copy = SDL_BeginGPUCopyPass(cmd);
+    if (!copy) {
+        SDL_Log("GpuBallBuffer: copy pass begin failed: %s", SDL_GetError());
+        return false;
+    }
+    SDL_GPUTransferBufferLocation src = { transfer_buf_, 0 };
+    SDL_GPUBufferRegion           dst = { gpu_buf_,      0, upload_size };
+    SDL_UploadToGPUBuffer(copy, &src, &dst, true /* cycle */);
+    SDL_EndGPUCopyPass(copy);
+
+    count_ = count;
+    return true;
+}
--- a/source/gpu/gpu_ball_buffer.hpp
+++ b/source/gpu/gpu_ball_buffer.hpp
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <SDL3/SDL_gpu.h>
+#include <cstdint>
+
+// ---------------------------------------------------------------------------
+// BallGPUData — 32-byte per-instance record stored in VRAM.
+// Positions and sizes pre-converted to NDC space on CPU so the vertex shader
+// needs no screen-dimension uniform.
+//   cx, cy : NDC center   (cx = (x + w/2)/sw*2-1,  cy = 1-(y+h/2)/sh*2)
+//   hw, hh : NDC half-size (hw = w/sw,  hh = h/sh, both positive)
+//   r,g,b,a: RGBA in [0,1]
+// ---------------------------------------------------------------------------
+struct BallGPUData {
+    float cx, cy;      // NDC center                 (byte offset 0)
+    float hw, hh;      // NDC half-size, positive    (byte offset 8)
+    float r, g, b, a;  // RGBA color [0,1]           (byte offset 16)
+};
+static_assert(sizeof(BallGPUData) == 32, "BallGPUData must be 32 bytes");  // size is used as the per-instance vertex pitch
+
+// ============================================================================
+// GpuBallBuffer — owns the GPU vertex buffer used for instanced ball rendering.
+//
+// Usage per frame:
+//   buffer.upload(device, cmd, data, count);  // begins and ends its own copy pass on cmd
+//   // Then in render pass: bind buffer, SDL_DrawGPUPrimitives(pass, 6, buffer.count(), 0, 0)
+// ============================================================================
+class GpuBallBuffer {
+public:
+    static constexpr int MAX_BALLS = 500000;  // capacity in BallGPUData records; both buffers are sized for it
+
+    bool init(SDL_GPUDevice* device);     // creates the vertex buffer and the upload transfer buffer; false on failure
+    void destroy(SDL_GPUDevice* device);  // releases both buffers and resets count(); does nothing if device is null
+
+    // Upload ball array to GPU via an internal copy pass.
+    // count is clamped to MAX_BALLS.  Returns false on error or empty input.
+    bool upload(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd,
+                const BallGPUData* data, int count);
+
+    SDL_GPUBuffer* buffer() const { return gpu_buf_; }  // vertex buffer handle; null until init() creates it
+    int count() const { return count_; }                // instance count stored by the last successful upload()
+
+private:
+    SDL_GPUBuffer*         gpu_buf_      = nullptr;  // instance vertex buffer (SDL_GPU_BUFFERUSAGE_VERTEX)
+    SDL_GPUTransferBuffer* transfer_buf_ = nullptr;  // upload staging buffer, mapped by upload()
+    int                    count_        = 0;        // set by upload(); reset to 0 by empty input and by destroy()
+};
--- a/source/gpu/gpu_pipeline.cpp
+++ b/source/gpu/gpu_pipeline.cpp
@@ -1,8 +1,10 @@
 #include "gpu_pipeline.hpp"
-#include "gpu_sprite_batch.hpp"  // for GpuVertex layout
+#include "gpu_sprite_batch.hpp"   // for GpuVertex layout
+#include "gpu_ball_buffer.hpp"    // for BallGPUData layout

 #include <SDL3/SDL_log.h>
 #include <cstddef>  // offsetof
+#include <cstring>  // strlen

 // ============================================================================
 // MSL Shaders (Metal Shading Language, macOS)
@@ -133,6 +135,60 @@ fragment float4 postfx_fs(PostVOut                   in    [[stage_in]],
 }
 )";

+// ---------------------------------------------------------------------------
+// Ball instanced vertex shader
+// Reads BallGPUData as per-instance attributes (input_rate = INSTANCE).
+// Generates a 6-vertex quad (2 triangles) per instance using vertex_id.
+//
+// BallGPUData layout as seen by the shader (struct BallInstance):
+//   float2 center   [[attribute(0)]]  — NDC center  (cx, cy)
+//   float2 halfsize [[attribute(1)]]  — NDC half-size (hw, hh), both positive
+//   float4 col      [[attribute(2)]]  — RGBA [0,1]
+//
+// NDC convention (SDL / Metal): Y increases upward (+1=top, -1=bottom).
+// halfsize.x = w/screen_w,  halfsize.y = h/screen_h  (positive; Y is not flipped)
+// Vertex order: TL TR BL | TR BR BL  (clockwise when traced in Y-up NDC; the ball pipeline sets no cull mode)
+// ---------------------------------------------------------------------------
+static const char* kBallInstancedVertMSL = R"(
+#include <metal_stdlib>
+using namespace metal;
+
+struct BallInstance {
+    float2 center   [[attribute(0)]];  // NDC center
+    float2 halfsize [[attribute(1)]];  // NDC half-size (both positive); 'half' is reserved in MSL
+    float4 col      [[attribute(2)]];
+};
+struct BallVOut {
+    float4 pos [[position]];
+    float2 uv;
+    float4 col;
+};
+
+vertex BallVOut ball_instanced_vs(BallInstance inst [[stage_in]],
+                                   uint vid [[vertex_id]]) {
+    // Offset signs for each of the 6 vertices (TL TR BL | TR BR BL)
+    const float2 offsets[6] = {
+        {-1.0f,  1.0f},  // TL
+        { 1.0f,  1.0f},  // TR
+        {-1.0f, -1.0f},  // BL
+        { 1.0f,  1.0f},  // TR (shared)
+        { 1.0f, -1.0f},  // BR
+        {-1.0f, -1.0f},  // BL (shared)
+    };
+    // UV: TL=(0,0) TR=(1,0) BL=(0,1) BR=(1,1)
+    const float2 uvs[6] = {
+        {0.0f, 0.0f}, {1.0f, 0.0f}, {0.0f, 1.0f},
+        {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f},
+    };
+    float2 pos = inst.center + offsets[vid] * inst.halfsize;
+    BallVOut out;
+    out.pos = float4(pos.x, pos.y, 0.0f, 1.0f);
+    out.uv  = uvs[vid];
+    out.col = inst.col;
+    return out;
+}
+)";
+
 // ============================================================================
 // GpuPipeline implementation
 // ============================================================================
@@ -222,6 +278,71 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
        return false;
    }

+    // ----------------------------------------------------------------
+    // Ball instanced pipeline
+    // Vertex: ball_instanced_vs (BallGPUData per-instance, no index buffer)
+    // Fragment: sprite_fs (same texture+color blend as sprite pipeline)
+    // Targets: offscreen (same as sprite pipeline)
+    // ----------------------------------------------------------------
+    SDL_GPUShader* ball_vert = createShader(device, kBallInstancedVertMSL, "ball_instanced_vs",
+                                             SDL_GPU_SHADERSTAGE_VERTEX, 0, 0);
+    SDL_GPUShader* ball_frag = createShader(device, kSpriteFragMSL, "sprite_fs",
+                                             SDL_GPU_SHADERSTAGE_FRAGMENT, 1, 0);
+    if (!ball_vert || !ball_frag) {
+        SDL_Log("GpuPipeline: failed to create ball instanced shaders");
+        if (ball_vert) SDL_ReleaseGPUShader(device, ball_vert);
+        if (ball_frag) SDL_ReleaseGPUShader(device, ball_frag);
+        return false;
+    }
+
+    // Vertex input: BallGPUData as per-instance data (step rate = 1 instance)
+    SDL_GPUVertexBufferDescription ball_vb_desc = {};
+    ball_vb_desc.slot               = 0;
+    ball_vb_desc.pitch              = sizeof(BallGPUData);
+    ball_vb_desc.input_rate         = SDL_GPU_VERTEXINPUTRATE_INSTANCE;
+    ball_vb_desc.instance_step_rate = 1;
+
+    SDL_GPUVertexAttribute ball_attrs[3] = {};
+    // attr 0: center (float2) at offset 0
+    ball_attrs[0].location    = 0;
+    ball_attrs[0].buffer_slot = 0;
+    ball_attrs[0].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
+    ball_attrs[0].offset      = static_cast<Uint32>(offsetof(BallGPUData, cx));
+    // attr 1: half-size (float2) at offset 8
+    ball_attrs[1].location    = 1;
+    ball_attrs[1].buffer_slot = 0;
+    ball_attrs[1].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
+    ball_attrs[1].offset      = static_cast<Uint32>(offsetof(BallGPUData, hw));
+    // attr 2: color (float4) at offset 16
+    ball_attrs[2].location    = 2;
+    ball_attrs[2].buffer_slot = 0;
+    ball_attrs[2].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT4;
+    ball_attrs[2].offset      = static_cast<Uint32>(offsetof(BallGPUData, r));
+
+    SDL_GPUVertexInputState ball_vertex_input = {};
+    ball_vertex_input.vertex_buffer_descriptions = &ball_vb_desc;
+    ball_vertex_input.num_vertex_buffers         = 1;
+    ball_vertex_input.vertex_attributes          = ball_attrs;
+    ball_vertex_input.num_vertex_attributes      = 3;
+
+    SDL_GPUGraphicsPipelineCreateInfo ball_pipe_info = {};
+    ball_pipe_info.vertex_shader      = ball_vert;
+    ball_pipe_info.fragment_shader    = ball_frag;
+    ball_pipe_info.vertex_input_state = ball_vertex_input;
+    ball_pipe_info.primitive_type     = SDL_GPU_PRIMITIVETYPE_TRIANGLELIST;
+    ball_pipe_info.target_info.num_color_targets         = 1;
+    ball_pipe_info.target_info.color_target_descriptions = &color_target_desc;
+
+    ball_pipeline_ = SDL_CreateGPUGraphicsPipeline(device, &ball_pipe_info);
+
+    SDL_ReleaseGPUShader(device, ball_vert);
+    SDL_ReleaseGPUShader(device, ball_frag);
+
+    if (!ball_pipeline_) {
+        SDL_Log("GpuPipeline: ball instanced pipeline creation failed: %s", SDL_GetError());
+        return false;
+    }
+
    // ----------------------------------------------------------------
    // UI overlay pipeline (same as sprite but renders to swapchain format)
    // Reuse sprite shaders with different target format.
@@ -275,12 +396,13 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
        return false;
    }

-    SDL_Log("GpuPipeline: sprite and postfx pipelines created successfully");
+    SDL_Log("GpuPipeline: all pipelines created successfully");
    return true;
 }

 void GpuPipeline::destroy(SDL_GPUDevice* device) {  // releases each pipeline that exists and nulls its handle
    if (sprite_pipeline_) { SDL_ReleaseGPUGraphicsPipeline(device, sprite_pipeline_); sprite_pipeline_ = nullptr; }
+    if (ball_pipeline_)   { SDL_ReleaseGPUGraphicsPipeline(device, ball_pipeline_);   ball_pipeline_   = nullptr; }
    if (postfx_pipeline_) { SDL_ReleaseGPUGraphicsPipeline(device, postfx_pipeline_); postfx_pipeline_ = nullptr; }
 }

@@ -289,7 +411,8 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
                                          const char* entrypoint,
                                          SDL_GPUShaderStage stage,
                                          Uint32 num_samplers,
-                                          Uint32 num_uniform_buffers) {
+                                          Uint32 num_uniform_buffers,
+                                          Uint32 num_storage_buffers) {
    SDL_GPUShaderCreateInfo info = {};
    info.code                = reinterpret_cast<const Uint8*>(msl_source);
    info.code_size           = static_cast<size_t>(strlen(msl_source) + 1);
@@ -298,7 +421,7 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
    info.stage               = stage;
    info.num_samplers        = num_samplers;
    info.num_storage_textures = 0;
-    info.num_storage_buffers  = 0;
+    info.num_storage_buffers  = num_storage_buffers;
    info.num_uniform_buffers  = num_uniform_buffers;

    SDL_GPUShader* shader = SDL_CreateGPUShader(device, &info);
--- a/source/gpu/gpu_pipeline.hpp
+++ b/source/gpu/gpu_pipeline.hpp
@@ -19,6 +19,9 @@ struct PostFXUniforms {
 //
 //  sprite_pipeline_ : textured quads, alpha blending.
 //                     Vertex layout: GpuVertex (pos float2, uv float2, col float4).
+//  ball_pipeline_   : instanced ball rendering, alpha blending.
+//                     Vertex layout: BallGPUData as per-instance data (input_rate=INSTANCE).
+//                     6 procedural vertices per instance (no index buffer).
 //  postfx_pipeline_ : full-screen triangle, no vertex buffer, no blend.
 //                     Reads offscreen texture, writes to swapchain.
 //                     Accepts PostFXUniforms via fragment uniform buffer slot 0.
@@ -33,7 +36,8 @@ public:
    void destroy(SDL_GPUDevice* device);

    SDL_GPUGraphicsPipeline* spritePipeline() const { return sprite_pipeline_; }
-    SDL_GPUGraphicsPipeline* postfxPipeline()  const { return postfx_pipeline_; }
+    SDL_GPUGraphicsPipeline* ballPipeline()   const { return ball_pipeline_; }
+    SDL_GPUGraphicsPipeline* postfxPipeline() const { return postfx_pipeline_; }

 private:
    SDL_GPUShader* createShader(SDL_GPUDevice* device,
@@ -41,8 +45,10 @@ private:
                                const char* entrypoint,
                                SDL_GPUShaderStage stage,
                                Uint32 num_samplers,
-                                Uint32 num_uniform_buffers);
+                                Uint32 num_uniform_buffers,
+                                Uint32 num_storage_buffers = 0);

    SDL_GPUGraphicsPipeline* sprite_pipeline_ = nullptr;
+    SDL_GPUGraphicsPipeline* ball_pipeline_   = nullptr;
    SDL_GPUGraphicsPipeline* postfx_pipeline_ = nullptr;
 };