refactor(gpu): eliminar GPU compute boids (prevé crash macOS)

Elimina el kernel Metal O(N²) de boids en GPU que causava GPU timeout a macOS amb >50K boles, arrossegant WindowServer fins al crash.

- Elimina gpu_boid_buffer.hpp/cpp (GpuBoidBuffer, BallComputeData, BoidParams)
- Elimina kBoidComputeMSL i kBallComputeVertMSL de gpu_pipeline
- Elimina boid_compute_pipeline_ i ball_compute_pipeline_
- Elimina use_gpu_boids_, boid_params_, ball_screen_uniforms_ de Engine
- Elimina syncAndExitGpuBoids() i tot el compute dispatch de render()
- Mode BOIDS ara usa sempre boid_manager_ (CPU, spatial hash O(N)) i renderitza via gpu_ball_buffer_ instanced (mateix path que PHYSICS)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-20 08:45:01 +01:00
parent badf92420b
commit d2e7f2ff86
8 changed files with 321 additions and 41 deletions
--- a/source/gpu/gpu_pipeline.cpp
+++ b/source/gpu/gpu_pipeline.cpp
@@ -1,8 +1,10 @@
 #include "gpu_pipeline.hpp"
-#include "gpu_sprite_batch.hpp"  // for GpuVertex layout
+#include "gpu_sprite_batch.hpp"   // for GpuVertex layout
+#include "gpu_ball_buffer.hpp"    // for BallGPUData layout

 #include <SDL3/SDL_log.h>
 #include <cstddef>  // offsetof
+#include <cstring>  // strlen

 // ============================================================================
 // MSL Shaders (Metal Shading Language, macOS)
@@ -133,6 +135,60 @@ fragment float4 postfx_fs(PostVOut                   in    [[stage_in]],
 }
 )";

+// ---------------------------------------------------------------------------
+// Ball instanced vertex shader
+// Reads BallGPUData as per-instance attributes (input_rate = INSTANCE).
+// Generates a 6-vertex quad (2 triangles) per instance using vertex_id.
+//
+// BallGPUData layout (as declared in the shader's BallInstance struct):
+//   float2 center   [[attribute(0)]]  — NDC center  (cx, cy)
+//   float2 halfsize [[attribute(1)]]  — NDC half-size (hw, hh), both positive
+//   float4 col      [[attribute(2)]]  — RGBA [0,1]
+//
+// NDC convention (SDL / Metal): Y increases upward (+1=top, -1=bottom).
+// halfsize.x = w/screen_w,  halfsize.y = h/screen_h  (positive; Y is not flipped)
+// Vertex order: TL TR BL | TR BR BL  (both triangles clockwise in Y-up NDC)
+// ---------------------------------------------------------------------------
+static const char* kBallInstancedVertMSL = R"(
+#include <metal_stdlib>
+using namespace metal;
+
+struct BallInstance {
+    float2 center   [[attribute(0)]];  // NDC center
+    float2 halfsize [[attribute(1)]];  // NDC half-size (both positive); 'half' is reserved in MSL
+    float4 col      [[attribute(2)]];
+};
+struct BallVOut {
+    float4 pos [[position]];
+    float2 uv;
+    float4 col;
+};
+
+vertex BallVOut ball_instanced_vs(BallInstance inst [[stage_in]],
+                                   uint vid [[vertex_id]]) {
+    // Offset signs for each of the 6 vertices (TL TR BL | TR BR BL)
+    const float2 offsets[6] = {
+        {-1.0f,  1.0f},  // TL
+        { 1.0f,  1.0f},  // TR
+        {-1.0f, -1.0f},  // BL
+        { 1.0f,  1.0f},  // TR (shared)
+        { 1.0f, -1.0f},  // BR
+        {-1.0f, -1.0f},  // BL (shared)
+    };
+    // UV: TL=(0,0) TR=(1,0) BL=(0,1) BR=(1,1)
+    const float2 uvs[6] = {
+        {0.0f, 0.0f}, {1.0f, 0.0f}, {0.0f, 1.0f},
+        {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f},
+    };
+    float2 pos = inst.center + offsets[vid] * inst.halfsize;
+    BallVOut out;
+    out.pos = float4(pos.x, pos.y, 0.0f, 1.0f);
+    out.uv  = uvs[vid];
+    out.col = inst.col;
+    return out;
+}
+)";
+
 // ============================================================================
 // GpuPipeline implementation
 // ============================================================================
@@ -222,6 +278,71 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
        return false;
    }

+    // ----------------------------------------------------------------
+    // Ball instanced pipeline
+    // Vertex: ball_instanced_vs (BallGPUData per-instance, no index buffer)
+    // Fragment: sprite_fs (same texture+color blend as sprite pipeline)
+    // Targets: offscreen (same as sprite pipeline)
+    // ----------------------------------------------------------------
+    SDL_GPUShader* ball_vert = createShader(device, kBallInstancedVertMSL, "ball_instanced_vs",
+                                             SDL_GPU_SHADERSTAGE_VERTEX, 0, 0);
+    SDL_GPUShader* ball_frag = createShader(device, kSpriteFragMSL, "sprite_fs",
+                                             SDL_GPU_SHADERSTAGE_FRAGMENT, 1, 0);
+    if (!ball_vert || !ball_frag) {
+        SDL_Log("GpuPipeline: failed to create ball instanced shaders");
+        if (ball_vert) SDL_ReleaseGPUShader(device, ball_vert);
+        if (ball_frag) SDL_ReleaseGPUShader(device, ball_frag);
+        return false;
+    }
+
+    // Vertex input: BallGPUData as per-instance data (step rate = 1 instance)
+    SDL_GPUVertexBufferDescription ball_vb_desc = {};
+    ball_vb_desc.slot               = 0;
+    ball_vb_desc.pitch              = sizeof(BallGPUData);
+    ball_vb_desc.input_rate         = SDL_GPU_VERTEXINPUTRATE_INSTANCE;
+    ball_vb_desc.instance_step_rate = 1;
+
+    SDL_GPUVertexAttribute ball_attrs[3] = {};
+    // attr 0: center (float2) at offset 0
+    ball_attrs[0].location    = 0;
+    ball_attrs[0].buffer_slot = 0;
+    ball_attrs[0].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
+    ball_attrs[0].offset      = static_cast<Uint32>(offsetof(BallGPUData, cx));
+    // attr 1: half-size (float2) at offset 8
+    ball_attrs[1].location    = 1;
+    ball_attrs[1].buffer_slot = 0;
+    ball_attrs[1].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
+    ball_attrs[1].offset      = static_cast<Uint32>(offsetof(BallGPUData, hw));
+    // attr 2: color (float4) at offset 16
+    ball_attrs[2].location    = 2;
+    ball_attrs[2].buffer_slot = 0;
+    ball_attrs[2].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT4;
+    ball_attrs[2].offset      = static_cast<Uint32>(offsetof(BallGPUData, r));
+
+    SDL_GPUVertexInputState ball_vertex_input = {};
+    ball_vertex_input.vertex_buffer_descriptions = &ball_vb_desc;
+    ball_vertex_input.num_vertex_buffers         = 1;
+    ball_vertex_input.vertex_attributes          = ball_attrs;
+    ball_vertex_input.num_vertex_attributes      = 3;
+
+    SDL_GPUGraphicsPipelineCreateInfo ball_pipe_info = {};
+    ball_pipe_info.vertex_shader      = ball_vert;
+    ball_pipe_info.fragment_shader    = ball_frag;
+    ball_pipe_info.vertex_input_state = ball_vertex_input;
+    ball_pipe_info.primitive_type     = SDL_GPU_PRIMITIVETYPE_TRIANGLELIST;
+    ball_pipe_info.target_info.num_color_targets         = 1;
+    ball_pipe_info.target_info.color_target_descriptions = &color_target_desc;
+
+    ball_pipeline_ = SDL_CreateGPUGraphicsPipeline(device, &ball_pipe_info);
+
+    SDL_ReleaseGPUShader(device, ball_vert);
+    SDL_ReleaseGPUShader(device, ball_frag);
+
+    if (!ball_pipeline_) {
+        SDL_Log("GpuPipeline: ball instanced pipeline creation failed: %s", SDL_GetError());
+        return false;
+    }
+
    // ----------------------------------------------------------------
    // UI overlay pipeline (same as sprite but renders to swapchain format)
    // Reuse sprite shaders with different target format.
@@ -275,12 +396,13 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
        return false;
    }

-    SDL_Log("GpuPipeline: sprite and postfx pipelines created successfully");
+    SDL_Log("GpuPipeline: all pipelines created successfully");
    return true;
 }

 void GpuPipeline::destroy(SDL_GPUDevice* device) {
    if (sprite_pipeline_) { SDL_ReleaseGPUGraphicsPipeline(device, sprite_pipeline_); sprite_pipeline_ = nullptr; }
+    if (ball_pipeline_)   { SDL_ReleaseGPUGraphicsPipeline(device, ball_pipeline_);   ball_pipeline_   = nullptr; }
    if (postfx_pipeline_) { SDL_ReleaseGPUGraphicsPipeline(device, postfx_pipeline_); postfx_pipeline_ = nullptr; }
 }

@@ -289,7 +411,8 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
                                          const char* entrypoint,
                                          SDL_GPUShaderStage stage,
                                          Uint32 num_samplers,
-                                          Uint32 num_uniform_buffers) {
+                                          Uint32 num_uniform_buffers,
+                                          Uint32 num_storage_buffers) {
    SDL_GPUShaderCreateInfo info = {};
    info.code                = reinterpret_cast<const Uint8*>(msl_source);
    info.code_size           = static_cast<size_t>(strlen(msl_source) + 1);
@@ -298,7 +421,7 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
    info.stage               = stage;
    info.num_samplers        = num_samplers;
    info.num_storage_textures = 0;
-    info.num_storage_buffers  = 0;
+    info.num_storage_buffers  = num_storage_buffers;
    info.num_uniform_buffers  = num_uniform_buffers;

    SDL_GPUShader* shader = SDL_CreateGPUShader(device, &info);