refactor(gpu): eliminar GPU compute boids (prevé crash macOS)

Elimina el kernel Metal O(N²) de boids en GPU que causava GPU timeout
a macOS amb >50K boles, arrossegant WindowServer fins al crash.

- Elimina gpu_boid_buffer.hpp/cpp (GpuBoidBuffer, BallComputeData, BoidParams)
- Elimina kBoidComputeMSL i kBallComputeVertMSL de gpu_pipeline
- Elimina boid_compute_pipeline_ i ball_compute_pipeline_
- Elimina use_gpu_boids_, boid_params_, ball_screen_uniforms_ de Engine
- Elimina syncAndExitGpuBoids() i tot el compute dispatch de render()
- Mode BOIDS ara usa sempre boid_manager_ (CPU, spatial hash O(N))
  i renderitza via gpu_ball_buffer_ instanced (mateix path que PHYSICS)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-20 08:45:01 +01:00
parent badf92420b
commit d2e7f2ff86
8 changed files with 321 additions and 41 deletions

View File

@@ -0,0 +1,65 @@
#include "gpu_ball_buffer.hpp"
#include <SDL3/SDL_log.h>
#include <algorithm> // std::min
#include <cstring> // memcpy
// Allocates the two GPU resources owned by this class, both sized for
// MAX_BALLS BallGPUData records:
//   - gpu_buf_      : vertex buffer holding the per-instance ball data
//   - transfer_buf_ : upload staging buffer that upload() maps every frame
//
// Returns true when both buffers were created. On failure the SDL error is
// logged, false is returned, and no buffer is left allocated.
//
// NOTE: the stored handles are overwritten without being released, so call
// destroy() before calling init() a second time.
bool GpuBallBuffer::init(SDL_GPUDevice* device) {
    Uint32 buf_size = static_cast<Uint32>(MAX_BALLS) * sizeof(BallGPUData);

    // GPU vertex buffer (instance-rate data read by the ball instanced shader)
    SDL_GPUBufferCreateInfo buf_info = {};
    buf_info.usage = SDL_GPU_BUFFERUSAGE_VERTEX;
    buf_info.size = buf_size;
    gpu_buf_ = SDL_CreateGPUBuffer(device, &buf_info);
    if (!gpu_buf_) {
        SDL_Log("GpuBallBuffer: GPU buffer creation failed: %s", SDL_GetError());
        return false;
    }

    // Transfer buffer (upload staging, cycled every frame)
    SDL_GPUTransferBufferCreateInfo tb_info = {};
    tb_info.usage = SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD;
    tb_info.size = buf_size;
    transfer_buf_ = SDL_CreateGPUTransferBuffer(device, &tb_info);
    if (!transfer_buf_) {
        // Log first: SDL_GetError() must be read before any further SDL call.
        SDL_Log("GpuBallBuffer: transfer buffer creation failed: %s", SDL_GetError());
        // Do not leave a half-initialized object behind: give back the vertex
        // buffer created above so a failed init() owns nothing.
        SDL_ReleaseGPUBuffer(device, gpu_buf_);
        gpu_buf_ = nullptr;
        return false;
    }

    SDL_Log("GpuBallBuffer: initialized (capacity %d balls, %.1f MB VRAM)",
            MAX_BALLS, buf_size / (1024.0f * 1024.0f));
    return true;
}
// Releases both GPU resources (staging buffer first, then the vertex buffer)
// and resets the stored instance count. Handles that are already null are
// skipped, so calling this repeatedly is harmless. With a null device the
// call does nothing at all: handles and count are left untouched.
void GpuBallBuffer::destroy(SDL_GPUDevice* device) {
    if (device == nullptr) {
        return;
    }

    if (transfer_buf_ != nullptr) {
        SDL_ReleaseGPUTransferBuffer(device, transfer_buf_);
        transfer_buf_ = nullptr;
    }

    if (gpu_buf_ != nullptr) {
        SDL_ReleaseGPUBuffer(device, gpu_buf_);
        gpu_buf_ = nullptr;
    }

    count_ = 0;
}
// Copies `count` ball records into the staging buffer and records a copy
// pass on `cmd` that transfers them into the GPU vertex buffer.
//
// device : GPU device that owns the buffers created by init().
// cmd    : command buffer the copy pass is recorded into. This function
//          begins and ends its own copy pass on it.
// data   : array of at least `count` BallGPUData records.
// count  : number of records; values above MAX_BALLS are clamped.
//
// Returns true when the copy was recorded; count() then reports the number
// of uploaded instances. Returns false when:
//   - the input is empty (count() is reset to 0),
//   - the buffers are not initialized (count() is reset to 0),
//   - mapping the staging buffer or beginning the copy pass fails
//     (count() keeps its previous value).
bool GpuBallBuffer::upload(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd,
                           const BallGPUData* data, int count) {
    if (!data || count <= 0) { count_ = 0; return false; }

    // init() failed or destroy() already ran: there is nothing to upload into.
    if (!gpu_buf_ || !transfer_buf_) {
        SDL_Log("GpuBallBuffer: upload called without initialized buffers");
        count_ = 0;
        return false;
    }

    // Clamp by value rather than through std::min: std::min binds MAX_BALLS
    // by reference, which would require an out-of-class definition of the
    // static constexpr member on pre-C++17 toolchains.
    if (count > MAX_BALLS) {
        count = MAX_BALLS;
    }
    Uint32 upload_size = static_cast<Uint32>(count) * sizeof(BallGPUData);

    void* ptr = SDL_MapGPUTransferBuffer(device, transfer_buf_, true /* cycle */);
    if (!ptr) {
        SDL_Log("GpuBallBuffer: transfer buffer map failed: %s", SDL_GetError());
        return false;
    }
    std::memcpy(ptr, data, upload_size);
    SDL_UnmapGPUTransferBuffer(device, transfer_buf_);

    SDL_GPUCopyPass* copy = SDL_BeginGPUCopyPass(cmd);
    if (!copy) {
        // Without a copy pass nothing was recorded, so do not report success
        // or publish a new instance count.
        SDL_Log("GpuBallBuffer: copy pass begin failed: %s", SDL_GetError());
        return false;
    }
    SDL_GPUTransferBufferLocation src = { transfer_buf_, 0 };
    SDL_GPUBufferRegion dst = { gpu_buf_, 0, upload_size };
    SDL_UploadToGPUBuffer(copy, &src, &dst, true /* cycle */);
    SDL_EndGPUCopyPass(copy);

    count_ = count;
    return true;
}

View File

@@ -0,0 +1,47 @@
#pragma once
#include <SDL3/SDL_gpu.h>
#include <cstdint>
// ---------------------------------------------------------------------------
// BallGPUData — 32-byte per-instance record stored in VRAM.
// Positions and sizes pre-converted to NDC space on CPU so the vertex shader
// needs no screen-dimension uniform.
// cx, cy : NDC center (cx = (x + w/2)/sw*2-1, cy = 1-(y+h/2)/sh*2)
// hw, hh : NDC half-size (hw = w/sw, hh = h/sh, both positive)
// r,g,b,a: RGBA in [0,1]
// ---------------------------------------------------------------------------
// Per-instance record consumed by the ball instanced vertex shader: eight
// consecutive floats. Member order is the binary layout, so it must not be
// rearranged.
struct BallGPUData {
    // Quad center in NDC.
    float cx;
    float cy;
    // Quad half-size in NDC (positive).
    float hw;
    float hh;
    // RGBA color, each channel in [0,1].
    float r;
    float g;
    float b;
    float a;
};
static_assert(sizeof(BallGPUData) == 32,
              "BallGPUData must be 32 bytes");
// ============================================================================
// GpuBallBuffer — owns the GPU vertex buffer used for instanced ball rendering.
//
// Usage per frame:
// buffer.upload(device, cmd, data, count); // begins and ends its own copy pass; call it outside any pass
// // Then in render pass: bind buffer, SDL_DrawGPUPrimitives(pass, 6, count, 0, 0)
// ============================================================================
class GpuBallBuffer {
public:
static constexpr int MAX_BALLS = 500000;
bool init(SDL_GPUDevice* device);
void destroy(SDL_GPUDevice* device);
// Upload ball array to GPU via an internal copy pass.
// count is clamped to MAX_BALLS. Returns false on error or empty input.
bool upload(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd,
const BallGPUData* data, int count);
SDL_GPUBuffer* buffer() const { return gpu_buf_; }
int count() const { return count_; }
private:
SDL_GPUBuffer* gpu_buf_ = nullptr;
SDL_GPUTransferBuffer* transfer_buf_ = nullptr;
int count_ = 0;
};

View File

@@ -1,8 +1,10 @@
#include "gpu_pipeline.hpp"
#include "gpu_sprite_batch.hpp" // for GpuVertex layout
#include "gpu_sprite_batch.hpp" // for GpuVertex layout
#include "gpu_ball_buffer.hpp" // for BallGPUData layout
#include <SDL3/SDL_log.h>
#include <cstddef> // offsetof
#include <cstring> // strlen
// ============================================================================
// MSL Shaders (Metal Shading Language, macOS)
@@ -133,6 +135,60 @@ fragment float4 postfx_fs(PostVOut in [[stage_in]],
}
)";
// ---------------------------------------------------------------------------
// Ball instanced vertex shader (MSL source, compiled at runtime)
// Reads BallGPUData as per-instance attributes (input_rate = INSTANCE).
// Generates a 6-vertex quad (2 triangles) per instance using vertex_id.
//
// BallGPUData layout as seen by the shader:
// float2 center [[attribute(0)]] — NDC center (cx, cy)
// float2 halfsize [[attribute(1)]] — NDC half-size (hw, hh), both positive
// (named 'halfsize' rather than 'half' because 'half' is reserved in MSL)
// float4 col [[attribute(2)]] — RGBA [0,1]
//
// NDC convention (SDL / Metal): Y increases upward (+1=top, -1=bottom).
// halfsize.x = w/screen_w, halfsize.y = h/screen_h (positive; Y is not flipped)
// Vertex order: TL TR BL | TR BR BL. With Y pointing up, both triangles are
// wound clockwise as seen on screen.
// NOTE(review): the ball pipeline setup visible in GpuPipeline::init does not
// configure any rasterizer state, so face culling is presumably disabled and
// the winding has no visible effect — confirm before enabling culling.
// UVs map TL=(0,0) TR=(1,0) BL=(0,1) BR=(1,1); the instance color is passed
// through unchanged.
// ---------------------------------------------------------------------------
static const char* kBallInstancedVertMSL = R"(
#include <metal_stdlib>
using namespace metal;
struct BallInstance {
float2 center [[attribute(0)]]; // NDC center
float2 halfsize [[attribute(1)]]; // NDC half-size (both positive); 'half' is reserved in MSL
float4 col [[attribute(2)]];
};
struct BallVOut {
float4 pos [[position]];
float2 uv;
float4 col;
};
vertex BallVOut ball_instanced_vs(BallInstance inst [[stage_in]],
uint vid [[vertex_id]]) {
// Offset signs for each of the 6 vertices (TL TR BL | TR BR BL)
const float2 offsets[6] = {
{-1.0f, 1.0f}, // TL
{ 1.0f, 1.0f}, // TR
{-1.0f, -1.0f}, // BL
{ 1.0f, 1.0f}, // TR (shared)
{ 1.0f, -1.0f}, // BR
{-1.0f, -1.0f}, // BL (shared)
};
// UV: TL=(0,0) TR=(1,0) BL=(0,1) BR=(1,1)
const float2 uvs[6] = {
{0.0f, 0.0f}, {1.0f, 0.0f}, {0.0f, 1.0f},
{1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f},
};
float2 pos = inst.center + offsets[vid] * inst.halfsize;
BallVOut out;
out.pos = float4(pos.x, pos.y, 0.0f, 1.0f);
out.uv = uvs[vid];
out.col = inst.col;
return out;
}
)";
// ============================================================================
// GpuPipeline implementation
// ============================================================================
@@ -222,6 +278,71 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
return false;
}
// ----------------------------------------------------------------
// Ball instanced pipeline
// Vertex: ball_instanced_vs (BallGPUData per-instance, no index buffer)
// Fragment: sprite_fs (same texture+color blend as sprite pipeline)
// Targets: offscreen (same as sprite pipeline)
// ----------------------------------------------------------------
SDL_GPUShader* ball_vert = createShader(device, kBallInstancedVertMSL, "ball_instanced_vs",
SDL_GPU_SHADERSTAGE_VERTEX, 0, 0);
SDL_GPUShader* ball_frag = createShader(device, kSpriteFragMSL, "sprite_fs",
SDL_GPU_SHADERSTAGE_FRAGMENT, 1, 0);
if (!ball_vert || !ball_frag) {
SDL_Log("GpuPipeline: failed to create ball instanced shaders");
if (ball_vert) SDL_ReleaseGPUShader(device, ball_vert);
if (ball_frag) SDL_ReleaseGPUShader(device, ball_frag);
return false;
}
// Vertex input: BallGPUData as per-instance data (step rate = 1 instance)
SDL_GPUVertexBufferDescription ball_vb_desc = {};
ball_vb_desc.slot = 0;
ball_vb_desc.pitch = sizeof(BallGPUData);
ball_vb_desc.input_rate = SDL_GPU_VERTEXINPUTRATE_INSTANCE;
ball_vb_desc.instance_step_rate = 1;
SDL_GPUVertexAttribute ball_attrs[3] = {};
// attr 0: center (float2) at offset 0
ball_attrs[0].location = 0;
ball_attrs[0].buffer_slot = 0;
ball_attrs[0].format = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
ball_attrs[0].offset = static_cast<Uint32>(offsetof(BallGPUData, cx));
// attr 1: half-size (float2) at offset 8
ball_attrs[1].location = 1;
ball_attrs[1].buffer_slot = 0;
ball_attrs[1].format = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
ball_attrs[1].offset = static_cast<Uint32>(offsetof(BallGPUData, hw));
// attr 2: color (float4) at offset 16
ball_attrs[2].location = 2;
ball_attrs[2].buffer_slot = 0;
ball_attrs[2].format = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT4;
ball_attrs[2].offset = static_cast<Uint32>(offsetof(BallGPUData, r));
SDL_GPUVertexInputState ball_vertex_input = {};
ball_vertex_input.vertex_buffer_descriptions = &ball_vb_desc;
ball_vertex_input.num_vertex_buffers = 1;
ball_vertex_input.vertex_attributes = ball_attrs;
ball_vertex_input.num_vertex_attributes = 3;
SDL_GPUGraphicsPipelineCreateInfo ball_pipe_info = {};
ball_pipe_info.vertex_shader = ball_vert;
ball_pipe_info.fragment_shader = ball_frag;
ball_pipe_info.vertex_input_state = ball_vertex_input;
ball_pipe_info.primitive_type = SDL_GPU_PRIMITIVETYPE_TRIANGLELIST;
ball_pipe_info.target_info.num_color_targets = 1;
ball_pipe_info.target_info.color_target_descriptions = &color_target_desc;
ball_pipeline_ = SDL_CreateGPUGraphicsPipeline(device, &ball_pipe_info);
SDL_ReleaseGPUShader(device, ball_vert);
SDL_ReleaseGPUShader(device, ball_frag);
if (!ball_pipeline_) {
SDL_Log("GpuPipeline: ball instanced pipeline creation failed: %s", SDL_GetError());
return false;
}
// ----------------------------------------------------------------
// UI overlay pipeline (same as sprite but renders to swapchain format)
// Reuse sprite shaders with different target format.
@@ -275,12 +396,13 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
return false;
}
SDL_Log("GpuPipeline: sprite and postfx pipelines created successfully");
SDL_Log("GpuPipeline: all pipelines created successfully");
return true;
}
// Releases the sprite, ball and postfx graphics pipelines, in that order,
// and nulls each handle. Handles that are already null are skipped.
void GpuPipeline::destroy(SDL_GPUDevice* device) {
    const auto release = [device](auto*& pipeline) {
        if (pipeline != nullptr) {
            SDL_ReleaseGPUGraphicsPipeline(device, pipeline);
            pipeline = nullptr;
        }
    };

    release(sprite_pipeline_);
    release(ball_pipeline_);
    release(postfx_pipeline_);
}
@@ -289,7 +411,8 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
const char* entrypoint,
SDL_GPUShaderStage stage,
Uint32 num_samplers,
Uint32 num_uniform_buffers) {
Uint32 num_uniform_buffers,
Uint32 num_storage_buffers) {
SDL_GPUShaderCreateInfo info = {};
info.code = reinterpret_cast<const Uint8*>(msl_source);
info.code_size = static_cast<size_t>(strlen(msl_source) + 1);
@@ -298,7 +421,7 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
info.stage = stage;
info.num_samplers = num_samplers;
info.num_storage_textures = 0;
info.num_storage_buffers = 0;
info.num_storage_buffers = num_storage_buffers;
info.num_uniform_buffers = num_uniform_buffers;
SDL_GPUShader* shader = SDL_CreateGPUShader(device, &info);

View File

@@ -19,6 +19,9 @@ struct PostFXUniforms {
//
// sprite_pipeline_ : textured quads, alpha blending.
// Vertex layout: GpuVertex (pos float2, uv float2, col float4).
// ball_pipeline_ : instanced ball rendering, alpha blending.
// Vertex layout: BallGPUData as per-instance data (input_rate=INSTANCE).
// 6 procedural vertices per instance (no index buffer).
// postfx_pipeline_ : full-screen triangle, no vertex buffer, no blend.
// Reads offscreen texture, writes to swapchain.
// Accepts PostFXUniforms via fragment uniform buffer slot 0.
@@ -33,7 +36,8 @@ public:
void destroy(SDL_GPUDevice* device);
SDL_GPUGraphicsPipeline* spritePipeline() const { return sprite_pipeline_; }
SDL_GPUGraphicsPipeline* postfxPipeline() const { return postfx_pipeline_; }
SDL_GPUGraphicsPipeline* ballPipeline() const { return ball_pipeline_; }
SDL_GPUGraphicsPipeline* postfxPipeline() const { return postfx_pipeline_; }
private:
SDL_GPUShader* createShader(SDL_GPUDevice* device,
@@ -41,8 +45,10 @@ private:
const char* entrypoint,
SDL_GPUShaderStage stage,
Uint32 num_samplers,
Uint32 num_uniform_buffers);
Uint32 num_uniform_buffers,
Uint32 num_storage_buffers = 0);
SDL_GPUGraphicsPipeline* sprite_pipeline_ = nullptr;
SDL_GPUGraphicsPipeline* ball_pipeline_ = nullptr;
SDL_GPUGraphicsPipeline* postfx_pipeline_ = nullptr;
};