diff --git a/source/defines.hpp b/source/defines.hpp
index e03c0e8..a0809cb 100644
--- a/source/defines.hpp
+++ b/source/defines.hpp
@@ -51,7 +51,8 @@ constexpr float GRAVITY_CHANGE_LATERAL_MAX = 0.08f;  // Velocidad lateral máxim
 constexpr float BALL_SPAWN_MARGIN = 0.15f;  // Margen lateral para spawn (0.25 = 25% a cada lado)
 
 // Escenarios de número de pelotas (teclas 1-8)
-constexpr int BALL_COUNT_SCENARIOS[8] = {10, 50, 100, 500, 1000, 5000, 10000, 50000};
+// Phase 1 (instanced rendering): practical limit is ~100K balls at 60 fps (physics-bound)
+constexpr int BALL_COUNT_SCENARIOS[8] = {10, 50, 100, 500, 1000, 5000, 10000, 100000};
 
 // Límites de escenario para modos automáticos (índices en BALL_COUNT_SCENARIOS)
 // BALL_COUNT_SCENARIOS = {10, 50, 100, 500, 1000, 5000, 10000, 50000}
diff --git a/source/engine.cpp b/source/engine.cpp
index aca76a0..aa3605c 100644
--- a/source/engine.cpp
+++ b/source/engine.cpp
@@ -237,6 +237,13 @@ bool Engine::initialize(int width, int height, int zoom, bool fullscreen, AppMod
                 success = false;
             }
 
+            gpu_ball_buffer_ = std::make_unique<GpuBallBuffer>();
+            if (!gpu_ball_buffer_->init(gpu_ctx_->device())) {
+                std::cerr << "ERROR: No se pudo crear el ball buffer GPU" << std::endl;
+                success = false;
+            }
+            ball_gpu_data_.reserve(GpuBallBuffer::MAX_BALLS);
+
             offscreen_tex_ = std::make_unique<GpuTexture>();
             if (!offscreen_tex_->createRenderTarget(gpu_ctx_->device(),
                                                     current_screen_width_, current_screen_height_,
@@ -377,8 +384,9 @@ void Engine::shutdown() {
         if (ui_tex_)        { ui_tex_->destroy(gpu_ctx_->device());        ui_tex_.reset(); }
         if (white_tex_)     { white_tex_->destroy(gpu_ctx_->device());     white_tex_.reset(); }
         if (offscreen_tex_) { offscreen_tex_->destroy(gpu_ctx_->device()); offscreen_tex_.reset(); }
-        if (sprite_batch_)  { sprite_batch_->destroy(gpu_ctx_->device());  sprite_batch_.reset(); }
-        if (gpu_pipeline_)  { gpu_pipeline_->destroy(gpu_ctx_->device());  gpu_pipeline_.reset(); }
+        if (sprite_batch_)    { sprite_batch_->destroy(gpu_ctx_->device());    sprite_batch_.reset(); }
+        if (gpu_ball_buffer_) { gpu_ball_buffer_->destroy(gpu_ctx_->device()); gpu_ball_buffer_.reset(); }
+        if (gpu_pipeline_)    { gpu_pipeline_->destroy(gpu_ctx_->device());    gpu_pipeline_.reset(); }
     }
 
     // Destroy software UI renderer and surface
@@ -437,7 +445,7 @@ void Engine::update() {
         // Modo Figura 3D: actualizar figura polimórfica
         updateShape();
     } else if (current_mode_ == SimulationMode::BOIDS) {
-        // Modo Boids: actualizar comportamiento de enjambre (delegado a BoidManager)
+        // CPU boids: actualizar comportamiento de enjambre (delegado a BoidManager)
         boid_manager_->update(delta_time_);
     }
 
@@ -544,18 +552,17 @@ void Engine::toggleDepthZoom() {
 // Boids (comportamiento de enjambre)
 void Engine::toggleBoidsMode(bool force_gravity_on) {
     if (current_mode_ == SimulationMode::BOIDS) {
-        // Salir del modo boids (velocidades ya son time-based, no requiere conversión)
+        // Salir del modo boids
         current_mode_ = SimulationMode::PHYSICS;
-        boid_manager_->deactivateBoids(force_gravity_on);  // Pasar parámetro para control preciso
+        boid_manager_->deactivateBoids(force_gravity_on);
     } else {
         // Entrar al modo boids (desde PHYSICS o SHAPE)
         if (current_mode_ == SimulationMode::SHAPE) {
-            // Si estamos en modo shape, salir primero sin forzar gravedad
             shape_manager_->toggleShapeMode(false);
             current_mode_ = SimulationMode::PHYSICS;
         }
 
-        // Activar modo boids
+        // Activar modo boids en CPU (configura gravedad OFF, inicializa velocidades)
         current_mode_ = SimulationMode::BOIDS;
         boid_manager_->activateBoids();
     }
@@ -728,8 +735,12 @@ void Engine::render() {
 
     // Sprites (balls)
     const auto& balls = scene_manager_->getBalls();
+    const float sw = static_cast<float>(current_screen_width_);
+    const float sh = static_cast<float>(current_screen_height_);
+
     if (current_mode_ == SimulationMode::SHAPE) {
-        // Bucket sort by depth Z (Painter's Algorithm)
+        // SHAPE mode: bucket sort by depth Z (Painter's Algorithm), with depth scale.
+        // Uses the sprite batch (supports per-sprite scale, needed for depth zoom).
         for (size_t i = 0; i < balls.size(); i++) {
             int b = static_cast<int>(balls[i]->getDepthBrightness() * (DEPTH_SORT_BUCKETS - 1));
             depth_buckets_[std::clamp(b, 0, DEPTH_SORT_BUCKETS - 1)].push_back(i);
@@ -745,39 +756,48 @@ void Engine::render() {
                                          color.r / 255.0f * bf,
                                          color.g / 255.0f * bf,
                                          color.b / 255.0f * bf,
-                                         1.0f, depth_scale,
-                                         static_cast<float>(current_screen_width_),
-                                         static_cast<float>(current_screen_height_));
+                                         1.0f, depth_scale, sw, sh);
             }
             depth_buckets_[b].clear();
         }
     } else {
-        size_t idx = 0;
-        for (const auto& ball : balls) {
-            SDL_FRect pos = ball->getPosition();
+        // PHYSICS / CPU-BOIDS mode: build instanced ball buffer (GPU instanced rendering).
+        // 32 bytes per ball instead of 4×32 bytes per quad — 4× less upload bandwidth.
+        ball_gpu_data_.clear();
+        for (size_t idx = 0; idx < balls.size(); idx++) {
+            SDL_FRect pos = balls[idx]->getPosition();
             Color color = theme_manager_->getInterpolatedColor(idx);
-            sprite_batch_->addSprite(pos.x, pos.y, pos.w, pos.h,
-                                     color.r / 255.0f, color.g / 255.0f, color.b / 255.0f,
-                                     1.0f, 1.0f,
-                                     static_cast<float>(current_screen_width_),
-                                     static_cast<float>(current_screen_height_));
-            idx++;
+            // Convert to NDC center + NDC half-size (both positive)
+            float cx = ((pos.x + pos.w * 0.5f) / sw) * 2.0f - 1.0f;
+            float cy = 1.0f - ((pos.y + pos.h * 0.5f) / sh) * 2.0f;
+            float hw = pos.w / sw;
+            float hh = pos.h / sh;
+            ball_gpu_data_.push_back({cx, cy, hw, hh,
+                                      color.r / 255.0f, color.g / 255.0f,
+                                      color.b / 255.0f, 1.0f});
         }
     }
 
     // UI overlay quad (drawn in Pass 2 over the postfx output)
     sprite_batch_->addFullscreenOverlay();
 
-    // Upload batch to GPU buffers
+    // Upload sprite batch (background + SHAPE balls + UI overlay quad)
     if (!sprite_batch_->uploadBatch(gpu_ctx_->device(), cmd)) {
         gpu_ctx_->submit(cmd);
         return;
     }
 
+    // Upload instanced ball buffer (PHYSICS / CPU-BOIDS modes)
+    bool use_instanced_balls = (current_mode_ != SimulationMode::SHAPE) && !ball_gpu_data_.empty();
+    if (use_instanced_balls) {
+        gpu_ball_buffer_->upload(gpu_ctx_->device(), cmd,
+                                  ball_gpu_data_.data(), static_cast<int>(ball_gpu_data_.size()));
+    }
+
     GpuTexture* sprite_tex = (!gpu_textures_.empty())
         ? gpu_textures_[current_texture_index_].get() : nullptr;
 
-    // === Pass 1: Render background + sprites to offscreen texture ===
+    // === Pass 1: Render background + balls to offscreen texture ===
     if (offscreen_tex_ && offscreen_tex_->isValid() && sprite_tex && sprite_tex->isValid()) {
         SDL_GPUColorTargetInfo ct = {};
         ct.texture     = offscreen_tex_->texture();
@@ -786,22 +806,36 @@ void Engine::render() {
         ct.store_op    = SDL_GPU_STOREOP_STORE;
 
         SDL_GPURenderPass* pass1 = SDL_BeginGPURenderPass(cmd, &ct, 1, nullptr);
+
+        // Background (white texture tinted by vertex color, via sprite batch)
         SDL_BindGPUGraphicsPipeline(pass1, gpu_pipeline_->spritePipeline());
-
-        SDL_GPUBufferBinding vb = {sprite_batch_->vertexBuffer(), 0};
-        SDL_GPUBufferBinding ib = {sprite_batch_->indexBuffer(), 0};
-        SDL_BindGPUVertexBuffers(pass1, 0, &vb, 1);
-        SDL_BindGPUIndexBuffer(pass1, &ib, SDL_GPU_INDEXELEMENTSIZE_32BIT);
-
-        // Background (white texture tinted by vertex color)
+        {
+            SDL_GPUBufferBinding vb = {sprite_batch_->vertexBuffer(), 0};
+            SDL_GPUBufferBinding ib = {sprite_batch_->indexBuffer(), 0};
+            SDL_BindGPUVertexBuffers(pass1, 0, &vb, 1);
+            SDL_BindGPUIndexBuffer(pass1, &ib, SDL_GPU_INDEXELEMENTSIZE_32BIT);
+        }
         if (white_tex_ && white_tex_->isValid() && sprite_batch_->bgIndexCount() > 0) {
             SDL_GPUTextureSamplerBinding tsb = {white_tex_->texture(), white_tex_->sampler()};
             SDL_BindGPUFragmentSamplers(pass1, 0, &tsb, 1);
             SDL_DrawGPUIndexedPrimitives(pass1, sprite_batch_->bgIndexCount(), 1, 0, 0, 0);
         }
 
-        // Sprites
-        if (sprite_batch_->spriteIndexCount() > 0) {
+        if (use_instanced_balls && gpu_ball_buffer_->count() > 0) {
+            // PHYSICS / CPU-BOIDS: instanced rendering — 6 procedural vertices per instance
+            SDL_BindGPUGraphicsPipeline(pass1, gpu_pipeline_->ballPipeline());
+            SDL_GPUBufferBinding ball_vb = {gpu_ball_buffer_->buffer(), 0};
+            SDL_BindGPUVertexBuffers(pass1, 0, &ball_vb, 1);
+            SDL_GPUTextureSamplerBinding tsb = {sprite_tex->texture(), sprite_tex->sampler()};
+            SDL_BindGPUFragmentSamplers(pass1, 0, &tsb, 1);
+            SDL_DrawGPUPrimitives(pass1, 6, static_cast<Uint32>(gpu_ball_buffer_->count()), 0, 0);
+        } else if (!use_instanced_balls && sprite_batch_->spriteIndexCount() > 0) {
+            // SHAPE: sprite batch with depth sort (re-bind sprite pipeline + buffers)
+            SDL_BindGPUGraphicsPipeline(pass1, gpu_pipeline_->spritePipeline());
+            SDL_GPUBufferBinding vb = {sprite_batch_->vertexBuffer(), 0};
+            SDL_GPUBufferBinding ib = {sprite_batch_->indexBuffer(), 0};
+            SDL_BindGPUVertexBuffers(pass1, 0, &vb, 1);
+            SDL_BindGPUIndexBuffer(pass1, &ib, SDL_GPU_INDEXELEMENTSIZE_32BIT);
             SDL_GPUTextureSamplerBinding tsb = {sprite_tex->texture(), sprite_tex->sampler()};
             SDL_BindGPUFragmentSamplers(pass1, 0, &tsb, 1);
             SDL_DrawGPUIndexedPrimitives(pass1, sprite_batch_->spriteIndexCount(), 1,
diff --git a/source/engine.hpp b/source/engine.hpp
index 97a3afe..1ef68e9 100644
--- a/source/engine.hpp
+++ b/source/engine.hpp
@@ -16,6 +16,7 @@
 #include "boids_mgr/boid_manager.hpp"   // for BoidManager
 #include "defines.hpp"                  // for GravityDirection, ColorTheme, ShapeType
 #include "external/texture.hpp"         // for Texture
+#include "gpu/gpu_ball_buffer.hpp"      // for GpuBallBuffer, BallGPUData
 #include "gpu/gpu_context.hpp"          // for GpuContext
 #include "gpu/gpu_pipeline.hpp"         // for GpuPipeline
 #include "gpu/gpu_sprite_batch.hpp"     // for GpuSpriteBatch
@@ -137,8 +138,10 @@ class Engine {
 
         // === SDL_GPU rendering pipeline ===
         std::unique_ptr<GpuContext>      gpu_ctx_;          // Device + swapchain
-        std::unique_ptr<GpuPipeline>     gpu_pipeline_;     // Sprite + postfx pipelines
-        std::unique_ptr<GpuSpriteBatch>  sprite_batch_;     // Per-frame vertex/index batch
+        std::unique_ptr<GpuPipeline>     gpu_pipeline_;     // Sprite + ball + postfx pipelines
+        std::unique_ptr<GpuSpriteBatch>  sprite_batch_;     // Per-frame vertex/index batch (bg + shape + UI)
+        std::unique_ptr<GpuBallBuffer>   gpu_ball_buffer_;  // Instanced ball instance data (PHYSICS/BOIDS)
+        std::vector<BallGPUData>         ball_gpu_data_;    // CPU-side staging vector (reused each frame)
         std::unique_ptr<GpuTexture>      offscreen_tex_;    // Offscreen render target (Pass 1)
         std::unique_ptr<GpuTexture>      white_tex_;        // 1×1 white (background gradient)
         std::unique_ptr<GpuTexture>      ui_tex_;           // UI text overlay texture
@@ -246,4 +249,5 @@ class Engine {
         void recreateOffscreenTexture();            // Recreate when resolution changes
         void renderUIToSurface();                   // Render text/UI to ui_surface_
         void uploadUISurface(SDL_GPUCommandBuffer* cmd_buf);  // Upload ui_surface_ → ui_tex_
+
 };
diff --git a/source/gpu/gpu_ball_buffer.cpp b/source/gpu/gpu_ball_buffer.cpp
new file mode 100644
index 0000000..4949b92
--- /dev/null
+++ b/source/gpu/gpu_ball_buffer.cpp
@@ -0,0 +1,65 @@
+#include "gpu_ball_buffer.hpp"
+
+#include <SDL3/SDL_log.h>
+#include <algorithm>  // std::min
+#include <cstring>    // memcpy
+
+// Allocates the instance vertex buffer and its upload staging buffer, both sized for MAX_BALLS
+// records. Returns false (after logging the SDL error) as soon as either allocation fails.
+bool GpuBallBuffer::init(SDL_GPUDevice* device) {
+    const Uint32 capacity_bytes = static_cast<Uint32>(MAX_BALLS) * sizeof(BallGPUData);
+
+    // Device-side buffer, bound as the instance-rate vertex stream of the ball pipeline
+    SDL_GPUBufferCreateInfo vertex_info = {};
+    vertex_info.size  = capacity_bytes;
+    vertex_info.usage = SDL_GPU_BUFFERUSAGE_VERTEX;
+    gpu_buf_ = SDL_CreateGPUBuffer(device, &vertex_info);
+    if (gpu_buf_ == nullptr) {
+        SDL_Log("GpuBallBuffer: GPU buffer creation failed: %s", SDL_GetError());
+        return false;
+    }
+    // Staging buffer that upload() maps, fills and copies from
+    SDL_GPUTransferBufferCreateInfo staging_info = {};
+    staging_info.size  = capacity_bytes;
+    staging_info.usage = SDL_GPU_TRANSFERBUFFERUSAGE_UPLOAD;
+    transfer_buf_ = SDL_CreateGPUTransferBuffer(device, &staging_info);
+    if (transfer_buf_ == nullptr) {
+        SDL_Log("GpuBallBuffer: transfer buffer creation failed: %s", SDL_GetError());
+        return false;
+    }
+    const float capacity_mb = capacity_bytes / (1024.0f * 1024.0f);
+    SDL_Log("GpuBallBuffer: initialized (capacity %d balls, %.1f MB VRAM)", MAX_BALLS, capacity_mb);
+    return true;
+}
+
+void GpuBallBuffer::destroy(SDL_GPUDevice* device) {  // Frees both buffers; no-op without a device
+    if (device == nullptr) { return; }
+    if (gpu_buf_ != nullptr)      { SDL_ReleaseGPUBuffer(device, gpu_buf_);              gpu_buf_      = nullptr; }
+    if (transfer_buf_ != nullptr) { SDL_ReleaseGPUTransferBuffer(device, transfer_buf_); transfer_buf_ = nullptr; }
+    count_ = 0;  // Nothing is left to draw once the buffers are gone
+}
+
+// Stages up to MAX_BALLS records and records their copy into gpu_buf_ in a copy pass of its own.
+// count_ is zeroed on every failure so a caller that ignores 'false' cannot draw stale instances.
+bool GpuBallBuffer::upload(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd,
+                            const BallGPUData* data, int count) {
+    count_ = 0;  // Re-published only after the copy has been recorded
+    if (!device || !cmd || !gpu_buf_ || !transfer_buf_ || !data || count <= 0) { return false; }
+    count = std::min(count, MAX_BALLS);
+    const Uint32 upload_size = static_cast<Uint32>(count) * sizeof(BallGPUData);
+
+    void* ptr = SDL_MapGPUTransferBuffer(device, transfer_buf_, true /* cycle */);
+    if (!ptr) { SDL_Log("GpuBallBuffer: transfer buffer map failed: %s", SDL_GetError()); return false; }
+    memcpy(ptr, data, upload_size);
+    SDL_UnmapGPUTransferBuffer(device, transfer_buf_);
+
+    SDL_GPUCopyPass* copy = SDL_BeginGPUCopyPass(cmd);
+    if (!copy) { SDL_Log("GpuBallBuffer: copy pass begin failed: %s", SDL_GetError()); return false; }
+    SDL_GPUTransferBufferLocation src = { transfer_buf_, 0 };
+    SDL_GPUBufferRegion           dst = { gpu_buf_,      0, upload_size };
+    SDL_UploadToGPUBuffer(copy, &src, &dst, true /* cycle */);
+    SDL_EndGPUCopyPass(copy);
+
+    count_ = count;
+    return true;
+}
diff --git a/source/gpu/gpu_ball_buffer.hpp b/source/gpu/gpu_ball_buffer.hpp
new file mode 100644
index 0000000..07ed78f
--- /dev/null
+++ b/source/gpu/gpu_ball_buffer.hpp
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <SDL3/SDL_gpu.h>
+#include <cstdint>
+
+// ---------------------------------------------------------------------------
+// BallGPUData — 32-byte per-instance record stored in VRAM.
+// Positions and sizes pre-converted to NDC space on CPU so the vertex shader
+// needs no screen-dimension uniform.
+//   cx, cy : NDC center   (cx = (x + w/2)/sw*2-1,  cy = 1-(y+h/2)/sh*2)
+//   hw, hh : NDC half-size (hw = w/sw,  hh = h/sh, both positive)
+//   r,g,b,a: RGBA in [0,1]
+// ---------------------------------------------------------------------------
+struct BallGPUData {
+    float cx, cy;      // NDC center; ball pipeline vertex attribute 0 (FLOAT2, byte offset 0)
+    float hw, hh;      // NDC half-size, positive; vertex attribute 1 (FLOAT2, byte offset 8)
+    float r, g, b, a;  // RGBA color in [0,1]; vertex attribute 2 (FLOAT4, byte offset 16)
+};
+static_assert(sizeof(BallGPUData) == 32, "BallGPUData must be 32 bytes");
+
+// ============================================================================
+// GpuBallBuffer — owns the GPU vertex buffer used for instanced ball rendering.
+//
+// Usage per frame:
+//   buffer.upload(device, cmd, data, count);  // outside any pass: upload() begins its own copy pass
+//   // Then in render pass: bind buffer, SDL_DrawGPUPrimitives(pass, 6, count, 0, 0)
+// ============================================================================
+class GpuBallBuffer {
+public:
+    static constexpr int MAX_BALLS = 500000;  // Instance capacity of both buffers
+
+    GpuBallBuffer() = default;
+    GpuBallBuffer(const GpuBallBuffer&)            = delete;  // Holds raw SDL handles freed by
+    GpuBallBuffer& operator=(const GpuBallBuffer&) = delete;  // destroy(); a copy would free them twice
+
+    bool init(SDL_GPUDevice* device);     // Creates both buffers; false if either creation fails
+    void destroy(SDL_GPUDevice* device);  // Releases both buffers, resets count(); no-op if device is null
+    // Uploads the ball array to the GPU through an internal copy pass, clamping count to MAX_BALLS.
+    // Returns false on error or empty input.
+    bool upload(SDL_GPUDevice* device, SDL_GPUCommandBuffer* cmd, const BallGPUData* data, int count);
+    SDL_GPUBuffer* buffer() const { return gpu_buf_; }  // Vertex buffer holding the instance records
+    int count() const { return count_; }                // Instance count recorded by upload()
+private:
+    SDL_GPUBuffer*         gpu_buf_      = nullptr;
+    SDL_GPUTransferBuffer* transfer_buf_ = nullptr;
+    int                    count_        = 0;
+};
diff --git a/source/gpu/gpu_pipeline.cpp b/source/gpu/gpu_pipeline.cpp
index 1c91f6e..71295f4 100644
--- a/source/gpu/gpu_pipeline.cpp
+++ b/source/gpu/gpu_pipeline.cpp
@@ -1,8 +1,10 @@
 #include "gpu_pipeline.hpp"
-#include "gpu_sprite_batch.hpp"  // for GpuVertex layout
+#include "gpu_sprite_batch.hpp"   // for GpuVertex layout
+#include "gpu_ball_buffer.hpp"    // for BallGPUData layout
 
 #include <SDL3/SDL_log.h>
 #include <cstddef>  // offsetof
+#include <cstring>  // strlen
 
 // ============================================================================
 // MSL Shaders (Metal Shading Language, macOS)
@@ -133,6 +135,60 @@ fragment float4 postfx_fs(PostVOut                   in    [[stage_in]],
 }
 )";
 
+// ---------------------------------------------------------------------------
+// Ball instanced vertex shader
+// Reads BallGPUData as per-instance attributes (input_rate = INSTANCE).
+// Generates a 6-vertex quad (2 triangles) per instance using vertex_id.
+//
+// BallGPUData layout:
+//   float2 center   [[attribute(0)]]  — NDC center  (cx, cy)
+//   float2 halfsize [[attribute(1)]]  — NDC half-size (hw, hh), both positive
+//   float4 col      [[attribute(2)]]  — RGBA [0,1]
+//
+// NDC convention (SDL / Metal): Y increases upward (+1=top, -1=bottom).
+// halfsize.x = w/screen_w,  halfsize.y = h/screen_h  (positive; Y is not flipped)
+// Vertex order: TL TR BL | TR BR BL  (clockwise as seen on screen, since +Y is up)
+// ---------------------------------------------------------------------------
+static const char* kBallInstancedVertMSL = R"(
+#include <metal_stdlib>
+using namespace metal;
+
+struct BallInstance {
+    float2 center   [[attribute(0)]];  // NDC center
+    float2 halfsize [[attribute(1)]];  // NDC half-size (both positive); 'half' is reserved in MSL
+    float4 col      [[attribute(2)]];
+};
+struct BallVOut {
+    float4 pos [[position]];
+    float2 uv;
+    float4 col;
+};
+
+vertex BallVOut ball_instanced_vs(BallInstance inst [[stage_in]],
+                                   uint vid [[vertex_id]]) {
+    // Offset signs for each of the 6 vertices (TL TR BL | TR BR BL)
+    const float2 offsets[6] = {
+        {-1.0f,  1.0f},  // TL
+        { 1.0f,  1.0f},  // TR
+        {-1.0f, -1.0f},  // BL
+        { 1.0f,  1.0f},  // TR (shared)
+        { 1.0f, -1.0f},  // BR
+        {-1.0f, -1.0f},  // BL (shared)
+    };
+    // UV: TL=(0,0) TR=(1,0) BL=(0,1) BR=(1,1)
+    const float2 uvs[6] = {
+        {0.0f, 0.0f}, {1.0f, 0.0f}, {0.0f, 1.0f},
+        {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f},
+    };
+    float2 pos = inst.center + offsets[vid] * inst.halfsize;
+    BallVOut out;
+    out.pos = float4(pos.x, pos.y, 0.0f, 1.0f);
+    out.uv  = uvs[vid];
+    out.col = inst.col;
+    return out;
+}
+)";
+
 // ============================================================================
 // GpuPipeline implementation
 // ============================================================================
@@ -222,6 +278,71 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
         return false;
     }
 
+    // ----------------------------------------------------------------
+    // Ball instanced pipeline
+    // Vertex: ball_instanced_vs (BallGPUData per-instance, no index buffer)
+    // Fragment: sprite_fs (same texture+color blend as sprite pipeline)
+    // Targets: offscreen (same as sprite pipeline)
+    // ----------------------------------------------------------------
+    SDL_GPUShader* ball_vert = createShader(device, kBallInstancedVertMSL, "ball_instanced_vs",
+                                             SDL_GPU_SHADERSTAGE_VERTEX, 0, 0);
+    SDL_GPUShader* ball_frag = createShader(device, kSpriteFragMSL, "sprite_fs",
+                                             SDL_GPU_SHADERSTAGE_FRAGMENT, 1, 0);
+    if (!ball_vert || !ball_frag) {
+        SDL_Log("GpuPipeline: failed to create ball instanced shaders");
+        if (ball_vert) SDL_ReleaseGPUShader(device, ball_vert);
+        if (ball_frag) SDL_ReleaseGPUShader(device, ball_frag);
+        return false;
+    }
+
+    // Vertex input: BallGPUData as per-instance data (step rate = 1 instance)
+    SDL_GPUVertexBufferDescription ball_vb_desc = {};
+    ball_vb_desc.slot               = 0;
+    ball_vb_desc.pitch              = sizeof(BallGPUData);
+    ball_vb_desc.input_rate         = SDL_GPU_VERTEXINPUTRATE_INSTANCE;
+    ball_vb_desc.instance_step_rate = 1;
+
+    SDL_GPUVertexAttribute ball_attrs[3] = {};
+    // attr 0: center (float2) at offset 0
+    ball_attrs[0].location    = 0;
+    ball_attrs[0].buffer_slot = 0;
+    ball_attrs[0].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
+    ball_attrs[0].offset      = static_cast<Uint32>(offsetof(BallGPUData, cx));
+    // attr 1: half-size (float2) at offset 8
+    ball_attrs[1].location    = 1;
+    ball_attrs[1].buffer_slot = 0;
+    ball_attrs[1].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT2;
+    ball_attrs[1].offset      = static_cast<Uint32>(offsetof(BallGPUData, hw));
+    // attr 2: color (float4) at offset 16
+    ball_attrs[2].location    = 2;
+    ball_attrs[2].buffer_slot = 0;
+    ball_attrs[2].format      = SDL_GPU_VERTEXELEMENTFORMAT_FLOAT4;
+    ball_attrs[2].offset      = static_cast<Uint32>(offsetof(BallGPUData, r));
+
+    SDL_GPUVertexInputState ball_vertex_input = {};
+    ball_vertex_input.vertex_buffer_descriptions = &ball_vb_desc;
+    ball_vertex_input.num_vertex_buffers         = 1;
+    ball_vertex_input.vertex_attributes          = ball_attrs;
+    ball_vertex_input.num_vertex_attributes      = 3;
+
+    SDL_GPUGraphicsPipelineCreateInfo ball_pipe_info = {};
+    ball_pipe_info.vertex_shader      = ball_vert;
+    ball_pipe_info.fragment_shader    = ball_frag;
+    ball_pipe_info.vertex_input_state = ball_vertex_input;
+    ball_pipe_info.primitive_type     = SDL_GPU_PRIMITIVETYPE_TRIANGLELIST;
+    ball_pipe_info.target_info.num_color_targets         = 1;
+    ball_pipe_info.target_info.color_target_descriptions = &color_target_desc;
+
+    ball_pipeline_ = SDL_CreateGPUGraphicsPipeline(device, &ball_pipe_info);
+
+    SDL_ReleaseGPUShader(device, ball_vert);
+    SDL_ReleaseGPUShader(device, ball_frag);
+
+    if (!ball_pipeline_) {
+        SDL_Log("GpuPipeline: ball instanced pipeline creation failed: %s", SDL_GetError());
+        return false;
+    }
+
     // ----------------------------------------------------------------
     // UI overlay pipeline (same as sprite but renders to swapchain format)
     // Reuse sprite shaders with different target format.
@@ -275,12 +396,13 @@ bool GpuPipeline::init(SDL_GPUDevice* device,
         return false;
     }
 
-    SDL_Log("GpuPipeline: sprite and postfx pipelines created successfully");
+    SDL_Log("GpuPipeline: all pipelines created successfully");
     return true;
 }
 
 void GpuPipeline::destroy(SDL_GPUDevice* device) {
     if (sprite_pipeline_) { SDL_ReleaseGPUGraphicsPipeline(device, sprite_pipeline_); sprite_pipeline_ = nullptr; }
+    if (ball_pipeline_)   { SDL_ReleaseGPUGraphicsPipeline(device, ball_pipeline_);   ball_pipeline_   = nullptr; }
     if (postfx_pipeline_) { SDL_ReleaseGPUGraphicsPipeline(device, postfx_pipeline_); postfx_pipeline_ = nullptr; }
 }
 
@@ -289,7 +411,8 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
                                           const char* entrypoint,
                                           SDL_GPUShaderStage stage,
                                           Uint32 num_samplers,
-                                          Uint32 num_uniform_buffers) {
+                                          Uint32 num_uniform_buffers,
+                                          Uint32 num_storage_buffers) {
     SDL_GPUShaderCreateInfo info = {};
     info.code                = reinterpret_cast<const Uint8*>(msl_source);
     info.code_size           = static_cast<size_t>(strlen(msl_source) + 1);
@@ -298,7 +421,7 @@ SDL_GPUShader* GpuPipeline::createShader(SDL_GPUDevice* device,
     info.stage               = stage;
     info.num_samplers        = num_samplers;
     info.num_storage_textures = 0;
-    info.num_storage_buffers  = 0;
+    info.num_storage_buffers  = num_storage_buffers;
     info.num_uniform_buffers  = num_uniform_buffers;
 
     SDL_GPUShader* shader = SDL_CreateGPUShader(device, &info);
diff --git a/source/gpu/gpu_pipeline.hpp b/source/gpu/gpu_pipeline.hpp
index 8f1e409..fc5e05b 100644
--- a/source/gpu/gpu_pipeline.hpp
+++ b/source/gpu/gpu_pipeline.hpp
@@ -19,6 +19,9 @@ struct PostFXUniforms {
 //
 //  sprite_pipeline_ : textured quads, alpha blending.
 //                     Vertex layout: GpuVertex (pos float2, uv float2, col float4).
+//  ball_pipeline_   : instanced ball rendering, alpha blending.
+//                     Vertex layout: BallGPUData as per-instance data (input_rate=INSTANCE).
+//                     6 procedural vertices per instance (no index buffer).
 //  postfx_pipeline_ : full-screen triangle, no vertex buffer, no blend.
 //                     Reads offscreen texture, writes to swapchain.
 //                     Accepts PostFXUniforms via fragment uniform buffer slot 0.
@@ -33,7 +36,8 @@ public:
     void destroy(SDL_GPUDevice* device);
 
     SDL_GPUGraphicsPipeline* spritePipeline() const { return sprite_pipeline_; }
-    SDL_GPUGraphicsPipeline* postfxPipeline()  const { return postfx_pipeline_; }
+    SDL_GPUGraphicsPipeline* ballPipeline()   const { return ball_pipeline_; }
+    SDL_GPUGraphicsPipeline* postfxPipeline() const { return postfx_pipeline_; }
 
 private:
     SDL_GPUShader* createShader(SDL_GPUDevice* device,
@@ -41,8 +45,10 @@ private:
                                 const char* entrypoint,
                                 SDL_GPUShaderStage stage,
                                 Uint32 num_samplers,
-                                Uint32 num_uniform_buffers);
+                                Uint32 num_uniform_buffers,
+                                Uint32 num_storage_buffers = 0);
 
     SDL_GPUGraphicsPipeline* sprite_pipeline_ = nullptr;
+    SDL_GPUGraphicsPipeline* ball_pipeline_   = nullptr;
     SDL_GPUGraphicsPipeline* postfx_pipeline_ = nullptr;
 };
diff --git a/source/input/input_handler.cpp b/source/input/input_handler.cpp
index 55aff13..6730651 100644
--- a/source/input/input_handler.cpp
+++ b/source/input/input_handler.cpp
@@ -105,7 +105,7 @@ bool InputHandler::processEvents(Engine& engine) {
 
                 // Toggle Modo Boids (comportamiento de enjambre)
                 case SDLK_B:
-                    // engine.toggleBoidsMode();
+                     engine.toggleBoidsMode();
                     break;
 
                 // Ciclar temas de color (movido de B a C)