diff --git a/crates/rapier2d-f64/Cargo.toml b/crates/rapier2d-f64/Cargo.toml index 099323811..0ce017bab 100644 --- a/crates/rapier2d-f64/Cargo.toml +++ b/crates/rapier2d-f64/Cargo.toml @@ -49,6 +49,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry2d-f64/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. debug-disable-legitimate-fe-exceptions = [] @@ -89,6 +90,11 @@ thiserror = "2" profiling = "1.0" static_assertions = "1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" diff --git a/crates/rapier2d/Cargo.toml b/crates/rapier2d/Cargo.toml index 75122662d..6578747ab 100644 --- a/crates/rapier2d/Cargo.toml +++ b/crates/rapier2d/Cargo.toml @@ -49,6 +49,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry2d/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. 
debug-disable-legitimate-fe-exceptions = [] @@ -92,8 +93,19 @@ static_assertions = "1" # TODO: should be re-exported from simba wide = "0.7.1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" serde = { version = "1", features = ["derive"] } oorandom = { version = "11", default-features = false } +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "gpu_benchmarks" +harness = false +required-features = ["gpu-acceleration"] diff --git a/crates/rapier2d/benches/gpu_benchmarks.rs b/crates/rapier2d/benches/gpu_benchmarks.rs new file mode 100644 index 000000000..e69de29bb diff --git a/crates/rapier3d-f64/Cargo.toml b/crates/rapier3d-f64/Cargo.toml index 7c295e121..d35962561 100644 --- a/crates/rapier3d-f64/Cargo.toml +++ b/crates/rapier3d-f64/Cargo.toml @@ -52,6 +52,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry3d-f64/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. 
debug-disable-legitimate-fe-exceptions = [] @@ -92,6 +93,11 @@ thiserror = "2" profiling = "1.0" static_assertions = "1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" diff --git a/crates/rapier3d/Cargo.toml b/crates/rapier3d/Cargo.toml index 82470777d..cc60ad11c 100644 --- a/crates/rapier3d/Cargo.toml +++ b/crates/rapier3d/Cargo.toml @@ -53,6 +53,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry3d/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. debug-disable-legitimate-fe-exceptions = [] @@ -96,8 +97,19 @@ static_assertions = "1" # TODO: should be re-exported from simba wide = "0.7.1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" serde = { version = "1", features = ["derive"] } oorandom = { version = "11", default-features = false } +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "gpu_benchmarks" +harness = false +required-features = ["gpu-acceleration"] diff --git a/crates/rapier3d/benches/gpu_benchmarks.rs b/crates/rapier3d/benches/gpu_benchmarks.rs new file mode 100644 index 000000000..e9d22836b --- /dev/null +++ b/crates/rapier3d/benches/gpu_benchmarks.rs @@ -0,0 +1,195 @@ +//! GPU vs CPU benchmarks: Direct comparison with summary table. +//! +//! Compares CPU and GPU performance for physics operations +//! and prints an easy-to-read comparison table.
+ +use rapier3d::prelude::*; +use rapier3d::gpu::{GpuContext, BufferManager}; +use std::time::Instant; + +/// Test scales: body counts +const SCALES: &[usize] = &[10, 50, 100, 500, 1_000, 5_000, 10_000]; +const ITERATIONS: usize = 10; // Number of iterations to average + +struct BenchmarkResult { + scale: usize, + cpu_time_us: f64, + gpu_time_us: f64, + speedup: f64, +} + +/// Helper to create a scene with N bodies in a grid. +fn create_test_bodies(count: usize) -> RigidBodySet { + let mut bodies = RigidBodySet::new(); + let side = (count as f32).cbrt().ceil() as usize; + let spacing = 2.5; + + for i in 0..side { + for j in 0..side { + for k in 0..side { + if bodies.len() >= count { + break; + } + + let pos = Vector::new( + i as Real * spacing, + j as Real * spacing, + k as Real * spacing, + ); + + let rb = RigidBodyBuilder::dynamic() + .translation(pos) + .linvel(Vector::new( + (i as Real - side as Real / 2.0) * 0.1, + -1.0, + (k as Real - side as Real / 2.0) * 0.1, + )) + .build(); + + bodies.insert(rb); + } + if bodies.len() >= count { + break; + } + } + if bodies.len() >= count { + break; + } + } + + bodies +} + +fn benchmark_cpu_integration(bodies: &RigidBodySet, iterations: usize) -> f64 { + let dt = 1.0 / 60.0; + let mut bodies_copy = bodies.clone(); + + let start = Instant::now(); + for _ in 0..iterations { + for (_handle, body) in bodies_copy.iter_mut() { + if body.is_dynamic() { + let mut pos = body.position().clone(); + let linvel = body.linvel(); + + pos.translation.x += linvel.x * dt; + pos.translation.y += linvel.y * dt; + pos.translation.z += linvel.z * dt; + + body.set_position(pos, false); + } + } + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn benchmark_gpu_transfer(bodies: &RigidBodySet, buffer_manager: &BufferManager, gpu_buffer: &mut rapier3d::gpu::RigidBodyGpuBuffer, iterations: usize) -> f64 { + let start = Instant::now(); + for _ in 0..iterations { + buffer_manager.upload_rigid_bodies(bodies, 
gpu_buffer); + let (_positions, _velocities) = buffer_manager.download_rigid_bodies(gpu_buffer); + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn print_results_table(results: &[BenchmarkResult]) { + println!("\n╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ CPU vs GPU PERFORMANCE COMPARISON ║"); + println!("╠═══════════════════════════════════════════════════════════════════════════════╣"); + println!("║ Bodies │ CPU Time │ GPU Time │ Speedup │ Winner ║"); + println!("╠══════════╪════════════════╪════════════════╪═════════════╪═══════════════════╣"); + + for result in results { + let winner = if result.speedup > 1.0 { + format!("GPU {:>6.2}x faster", result.speedup) + } else if result.speedup < 0.95 { + format!("CPU {:>6.2}x faster", 1.0 / result.speedup) + } else { + " ~Same speed ".to_string() + }; + + let cpu_str = format_time(result.cpu_time_us); + let gpu_str = format_time(result.gpu_time_us); + + println!("║ {:>8} │ {:>14} │ {:>14} │ {:>11.2}x │ {:>17} ║", + result.scale, + cpu_str, + gpu_str, + result.speedup, + winner); + } + + println!("╚══════════╧════════════════╧════════════════╧═════════════╧═══════════════════╝"); + println!("\nNote: GPU times include CPU↔GPU transfer overhead."); + println!(" Actual GPU compute will be added in Phase 2."); +} + +fn format_time(us: f64) -> String { + if us < 1_000.0 { + format!("{:>10.2} µs", us) + } else if us < 1_000_000.0 { + format!("{:>10.2} ms", us / 1_000.0) + } else { + format!("{:>10.2} s", us / 1_000_000.0) + } +} + +fn main() { + println!("╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ RAPIER GPU ACCELERATION BENCHMARK ║"); + println!("╚═══════════════════════════════════════════════════════════════════════════════╝\n"); + + println!("Initializing GPU..."); + let gpu_setup = match GpuContext::new() { + Ok(ctx) => { + println!("✓ GPU initialized: {}", 
ctx.adapter.get_info().name); + let buffer_manager = BufferManager::new(ctx.device, ctx.queue); + Some(buffer_manager) + }, + Err(e) => { + println!("✗ GPU not available: {:?}", e); + println!(" Running CPU-only benchmarks...\n"); + None + } + }; + + let mut results = Vec::new(); + + for &scale in SCALES { + print!("Benchmarking {} bodies... ", scale); + std::io::Write::flush(&mut std::io::stdout()).unwrap(); + + let bodies = create_test_bodies(scale); + + // Benchmark CPU + let cpu_time = benchmark_cpu_integration(&bodies, ITERATIONS); + + // Benchmark GPU if available + let gpu_time = if let Some(ref buffer_manager) = gpu_setup { + let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len()); + benchmark_gpu_transfer(&bodies, buffer_manager, &mut gpu_buffer, ITERATIONS) + } else { + 0.0 + }; + + let speedup = if gpu_time > 0.0 { + cpu_time / gpu_time + } else { + 0.0 + }; + + results.push(BenchmarkResult { + scale, + cpu_time_us: cpu_time, + gpu_time_us: gpu_time, + speedup, + }); + + println!("Done!"); + } + + print_results_table(&results); +} diff --git a/examples3d/Cargo.toml b/examples3d/Cargo.toml index 1230a9e83..e17fd2a60 100644 --- a/examples3d/Cargo.toml +++ b/examples3d/Cargo.toml @@ -43,6 +43,10 @@ path = "./all_examples3.rs" name = "harness_capsules3" path = "./harness_capsules3.rs" +[[bin]] +name = "gpu_benchmark" +path = "./gpu_benchmark.rs" + #[lib] #crate-type = ["cdylib", "rlib"] #path = "./all_examples3_wasm.rs" diff --git a/examples3d/gpu_benchmark.rs b/examples3d/gpu_benchmark.rs new file mode 100644 index 000000000..930b5ad78 --- /dev/null +++ b/examples3d/gpu_benchmark.rs @@ -0,0 +1,216 @@ +//! GPU vs CPU benchmarks: Direct comparison with summary table. +//! +//! Compares CPU and GPU performance for physics operations +//! and prints an easy-to-read comparison table. 
+ +use rapier3d::prelude::*; +use rapier3d::gpu::{GpuContext, BufferManager, GpuIntegrator}; +use std::time::Instant; + use rapier3d::gpu::wgpu; + +/// Test scales: body counts +const SCALES: &[usize] = &[10, 50, 100, 500, 1_000, 5_000, 10_000, 50_000, 100_000, 500_000, 1_000_000]; +const ITERATIONS: usize = 10; // Number of iterations to average + +struct BenchmarkResult { + scale: usize, + cpu_time_us: f64, + gpu_time_us: f64, + speedup: f64, +} + +/// Helper to create a scene with N bodies in a grid. +fn create_test_bodies(count: usize) -> RigidBodySet { + let mut bodies = RigidBodySet::new(); + let side = (count as f32).cbrt().ceil() as usize; + let spacing = 2.5; + + for i in 0..side { + for j in 0..side { + for k in 0..side { + if bodies.len() >= count { + break; + } + + let pos = Vector::new( + i as Real * spacing, + j as Real * spacing, + k as Real * spacing, + ); + + let rb = RigidBodyBuilder::dynamic() + .translation(pos) + .linvel(Vector::new( + (i as Real - side as Real / 2.0) * 0.1, + -1.0, + (k as Real - side as Real / 2.0) * 0.1, + )) + .build(); + + bodies.insert(rb); + } + if bodies.len() >= count { + break; + } + } + if bodies.len() >= count { + break; + } + } + + bodies +} + +fn benchmark_cpu_integration(bodies: &RigidBodySet, iterations: usize) -> f64 { + let dt = 1.0 / 60.0; + let mut bodies_copy = bodies.clone(); + + let start = Instant::now(); + for _ in 0..iterations { + for (_handle, body) in bodies_copy.iter_mut() { + if body.is_dynamic() { + let mut pos = body.position().clone(); + let linvel = body.linvel(); + + pos.translation.x += linvel.x * dt; + pos.translation.y += linvel.y * dt; + pos.translation.z += linvel.z * dt; + + body.set_position(pos, false); + } + } + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn benchmark_gpu_integration( + bodies: &RigidBodySet, + buffer_manager: &BufferManager, + integrator: &GpuIntegrator, + device: &wgpu::Device, + queue: &wgpu::Queue, + gpu_buffer: 
&mut rapier3d::gpu::RigidBodyGpuBuffer, + iterations: usize +) -> f64 { + let dt = 1.0 / 60.0; + let gravity = [0.0, -9.81, 0.0]; + + let start = Instant::now(); + for _ in 0..iterations { + // Upload data to GPU + buffer_manager.upload_rigid_bodies(bodies, gpu_buffer); + + // Run GPU integration compute shader + integrator.integrate(device, queue, gpu_buffer, dt, gravity, 0.0, 0.0); + + // Download results from GPU + let (_positions, _velocities) = buffer_manager.download_rigid_bodies(gpu_buffer); + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn print_results_table(results: &[BenchmarkResult]) { + println!("\n╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ CPU vs GPU PERFORMANCE COMPARISON ║"); + println!("╠═══════════════════════════════════════════════════════════════════════════════╣"); + println!("║ Bodies │ CPU Time │ GPU Time │ Speedup │ Winner ║"); + println!("╠══════════╪════════════════╪════════════════╪═════════════╪═══════════════════╣"); + + for result in results { + let winner = if result.speedup > 1.0 { + format!("GPU {:>6.2}x faster", result.speedup) + } else if result.speedup < 0.95 { + format!("CPU {:>6.2}x faster", 1.0 / result.speedup) + } else { + " ~Same speed ".to_string() + }; + + let cpu_str = format_time(result.cpu_time_us); + let gpu_str = format_time(result.gpu_time_us); + + println!("║ {:>8} │ {:>14} │ {:>14} │ {:>11.2}x │ {:>17} ║", + result.scale, + cpu_str, + gpu_str, + result.speedup, + winner); + } + + println!("╚══════════╧════════════════╧════════════════╧═════════════╧═══════════════════╝"); + println!("\nNote: GPU times now include ACTUAL GPU COMPUTE (integration kernel)!"); + println!(" Transfer overhead + compute shader execution on RTX 4090."); +} + +fn format_time(us: f64) -> String { + if us < 1_000.0 { + format!("{:>10.2} µs", us) + } else if us < 1_000_000.0 { + format!("{:>10.2} ms", us / 1_000.0) + } else { + 
format!("{:>10.2} s", us / 1_000_000.0) + } +} + +fn main() { + println!("╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ RAPIER GPU ACCELERATION BENCHMARK ║"); + println!("╚═══════════════════════════════════════════════════════════════════════════════╝\n"); + + println!("Initializing GPU..."); + let gpu_ctx = match GpuContext::new() { + Ok(ctx) => { + println!("✓ GPU initialized: {}", ctx.adapter.get_info().name); + Some(ctx) + }, + Err(e) => { + println!("✗ GPU not available: {:?}", e); + println!(" Running CPU-only benchmarks...\n"); + None + } + }; + + let mut results = Vec::new(); + + for &scale in SCALES { + print!("Benchmarking {} bodies... ", scale); + std::io::Write::flush(&mut std::io::stdout()).unwrap(); + + let bodies = create_test_bodies(scale); + + // Benchmark CPU + let cpu_time = benchmark_cpu_integration(&bodies, ITERATIONS); + + // Benchmark GPU if available + let gpu_time = if let Some(ref ctx) = gpu_ctx { + // Create buffer manager and integrator with the same device + let buffer_manager = BufferManager::new(ctx.device.clone(), ctx.queue.clone()); + let integrator = GpuIntegrator::new(&ctx.device); + + let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len()); + benchmark_gpu_integration(&bodies, &buffer_manager, &integrator, &ctx.device, &ctx.queue, &mut gpu_buffer, ITERATIONS) + } else { + 0.0 + }; + + let speedup = if gpu_time > 0.0 { + cpu_time / gpu_time + } else { + 0.0 + }; + + results.push(BenchmarkResult { + scale, + cpu_time_us: cpu_time, + gpu_time_us: gpu_time, + speedup, + }); + + println!("Done!"); + } + + print_results_table(&results); +} diff --git a/phase2_results.txt b/phase2_results.txt new file mode 100644 index 000000000..cc395ae81 --- /dev/null +++ b/phase2_results.txt @@ -0,0 +1,136 @@ +warning: unused imports: `BufferManager` and `GpuContext` + --> crates\rapier3d\..\..\src\gpu\integrator.rs:8:18 + | +8 | use crate::gpu::{GpuContext, BufferManager, 
RigidBodyGpuBuffer}; + | ^^^^^^^^^^ ^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + +warning: unused import: `crate::dynamics::RigidBodySet` + --> crates\rapier3d\..\..\src\gpu\integrator.rs:9:5 + | +9 | use crate::dynamics::RigidBodySet; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: associated function `vector_to_gpu` is never used + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:353:8 + | + 82 | impl BufferManager { + | ------------------ associated function in this implementation +... +353 | fn vector_to_gpu(v: &na::Vector3) -> GpuVector3 { + | ^^^^^^^^^^^^^ + | + = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default + +warning: function `storage_buffer_binding` is never used + --> crates\rapier3d\..\..\src\gpu\pipeline.rs:111:8 + | +111 | pub fn storage_buffer_binding(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry { + | ^^^^^^^^^^^^^^^^^^^^^^ + +warning: function `uniform_buffer_binding` is never used + --> crates\rapier3d\..\..\src\gpu\pipeline.rs:125:8 + | +125 | pub fn uniform_buffer_binding(binding: u32) -> wgpu::BindGroupLayoutEntry { + | ^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\device.rs:12:5 + | +12 | pub device: wgpu::Device, + | ^^^^^^^^^^^^^^^^^^^^^^^^ + | +note: the lint level is defined here + --> crates\rapier3d\..\..\src\lib.rs:14:9 + | +14 | #![warn(missing_docs)] + | ^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\device.rs:13:5 + | +13 | pub queue: wgpu::Queue, + | ^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\device.rs:14:5 + | +14 | pub adapter: wgpu::Adapter, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:20:5 + | +20 | pub positions_buffer: wgpu::Buffer, + | 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:21:5 + | +21 | pub rotations_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:24:5 + | +24 | pub lin_velocities_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:25:5 + | +25 | pub ang_velocities_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:28:5 + | +28 | pub forces_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:29:5 + | +29 | pub torques_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:32:5 + | +32 | pub inv_masses_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:33:5 + | +33 | pub inv_inertias_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:36:5 + | +36 | pub body_count: usize, + | ^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:37:5 + | +37 | pub capacity: usize, + | ^^^^^^^^^^^^^^^^^^^ + +warning: `rapier3d` (lib) generated 18 warnings (run `cargo fix --lib -p rapier3d` to apply 2 suggestions) + Finished `release` profile [optimized] target(s) in 0.38s + Running `target\release\gpu_benchmark.exe` 
+ΓòöΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòù +Γòæ RAPIER GPU ACCELERATION BENCHMARK Γòæ +ΓòÜΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓò¥ + +Initializing GPU... +Γ£ô GPU initialized: NVIDIA GeForce RTX 4090 + +thread 'main' (27040) panicked at C:\Users\redst\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\wgpu-core-22.1.0\src\storage.rs:79:46: +Buffer[Id(1,1,vk)] does not exist +note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace +error: process didn't exit successfully: `target\release\gpu_benchmark.exe` (exit code: 101) +Benchmarking 10 bodies... diff --git a/src/gpu/buffer_manager.rs b/src/gpu/buffer_manager.rs new file mode 100644 index 000000000..68fbbda6e --- /dev/null +++ b/src/gpu/buffer_manager.rs @@ -0,0 +1,400 @@ +//! GPU buffer management for physics data. + +use crate::dynamics::RigidBodySet; +use crate::math::Real; + +#[cfg(feature = "dim3")] +use crate::glamx::{Vec3, Quat}; +#[cfg(feature = "dim2")] +use crate::glamx::Vec2; + +use wgpu; +use bytemuck::{Pod, Zeroable}; + +/// GPU-friendly representation of rigid body data using Structure-of-Arrays layout. +/// +/// This layout optimizes for coalesced GPU memory access by grouping similar +/// data types together rather than per-body structures. 
+pub struct RigidBodyGpuBuffer { + // Position data + pub positions_buffer: wgpu::Buffer, + pub rotations_buffer: wgpu::Buffer, + + // Velocity data + pub lin_velocities_buffer: wgpu::Buffer, + pub ang_velocities_buffer: wgpu::Buffer, + + // Force accumulation + pub forces_buffer: wgpu::Buffer, + pub torques_buffer: wgpu::Buffer, + + // Mass properties + pub inv_masses_buffer: wgpu::Buffer, + pub inv_inertias_buffer: wgpu::Buffer, + + // Metadata + pub body_count: usize, + pub capacity: usize, +} + +/// Position data for GPU (3D vector). +#[repr(C)] +#[derive(Copy, Clone, Debug, Pod, Zeroable)] +pub struct GpuVector3 { + pub x: f32, + pub y: f32, + pub z: f32, + pub _padding: f32, // Align to 16 bytes +} + +/// Rotation data for GPU (quaternion in 3D, angle in 2D). +#[repr(C)] +#[derive(Copy, Clone, Debug, Pod, Zeroable)] +pub struct GpuRotation { + #[cfg(feature = "dim3")] + pub x: f32, + #[cfg(feature = "dim3")] + pub y: f32, + #[cfg(feature = "dim3")] + pub z: f32, + #[cfg(feature = "dim3")] + pub w: f32, + + #[cfg(feature = "dim2")] + pub angle: f32, + #[cfg(feature = "dim2")] + pub _padding: [f32; 3], +} + +/// 3x3 matrix for inertia tensors (stored as row-major). +#[repr(C)] +#[derive(Copy, Clone, Debug, Pod, Zeroable)] +pub struct GpuMatrix3 { + pub data: [f32; 12], // 3x3 matrix + padding to 4x3 +} + +/// Manages GPU buffer lifecycle and CPU↔GPU transfers. +pub struct BufferManager { + device: std::sync::Arc, + queue: std::sync::Arc, +} + +impl BufferManager { + /// Creates a new buffer manager. + pub fn new(device: std::sync::Arc, queue: std::sync::Arc) -> Self { + Self { device, queue } + } + + /// Creates GPU buffers for rigid body data. 
+ /// + /// # Arguments + /// + /// * `capacity` - Maximum number of bodies to allocate space for + pub fn create_rigid_body_buffer(&self, capacity: usize) -> RigidBodyGpuBuffer { + let create_buffer = |label: &str, size: usize| { + self.device.create_buffer(&wgpu::BufferDescriptor { + label: Some(label), + size: size as u64, + usage: wgpu::BufferUsages::STORAGE + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }) + }; + + let vec3_size = std::mem::size_of::() * capacity; + let rot_size = std::mem::size_of::() * capacity; + let scalar_size = std::mem::size_of::() * capacity; + let mat3_size = std::mem::size_of::() * capacity; + + RigidBodyGpuBuffer { + positions_buffer: create_buffer("RigidBody Positions", vec3_size), + rotations_buffer: create_buffer("RigidBody Rotations", rot_size), + lin_velocities_buffer: create_buffer("RigidBody Linear Velocities", vec3_size), + ang_velocities_buffer: create_buffer("RigidBody Angular Velocities", vec3_size), + forces_buffer: create_buffer("RigidBody Forces", vec3_size), + torques_buffer: create_buffer("RigidBody Torques", vec3_size), + inv_masses_buffer: create_buffer("RigidBody Inverse Masses", scalar_size), + inv_inertias_buffer: create_buffer("RigidBody Inverse Inertias", mat3_size), + body_count: 0, + capacity, + } + } + + /// Uploads rigid body data from CPU to GPU. + /// + /// # Arguments + /// + /// * `bodies` - The rigid body set to upload + /// * `gpu_buffer` - The GPU buffer to write to + pub fn upload_rigid_bodies( + &self, + bodies: &RigidBodySet, + gpu_buffer: &mut RigidBodyGpuBuffer, + ) { + let body_count = bodies.len(); + + if body_count > gpu_buffer.capacity { + log::warn!( + "Body count ({}) exceeds GPU buffer capacity ({}). 
Truncating.", + body_count, + gpu_buffer.capacity + ); + } + + let count = body_count.min(gpu_buffer.capacity); + gpu_buffer.body_count = count; + + // Prepare CPU-side data in SoA layout + let mut positions = Vec::with_capacity(count); + let mut rotations = Vec::with_capacity(count); + let mut lin_vels = Vec::with_capacity(count); + let mut ang_vels = Vec::with_capacity(count); + let mut forces = Vec::with_capacity(count); + let mut torques = Vec::with_capacity(count); + let mut inv_masses = Vec::with_capacity(count); + let mut inv_inertias = Vec::with_capacity(count); + + for (_handle, body) in bodies.iter().take(count) { + let pos = &body.position().translation; + #[cfg(feature = "dim3")] + positions.push(Self::vector_to_gpu_from_vec3(pos)); + #[cfg(feature = "dim2")] + positions.push(Self::vector_to_gpu_from_vec2(pos)); + + #[cfg(feature = "dim3")] + rotations.push(Self::rotation_to_gpu(&body.position().rotation)); + #[cfg(feature = "dim2")] + rotations.push(Self::rotation_to_gpu(&body.position().rotation)); + + let vel = body.linvel(); + #[cfg(feature = "dim3")] + lin_vels.push(Self::vector_to_gpu_from_vec3(&vel)); + #[cfg(feature = "dim2")] + lin_vels.push(Self::vector_to_gpu_from_vec2(&vel)); + + #[cfg(feature = "dim3")] + { + let angvel = body.angvel(); + ang_vels.push(GpuVector3 { + x: angvel.x, + y: angvel.y, + z: angvel.z, + _padding: 0.0, + }); + } + #[cfg(feature = "dim2")] + { + let angvel = body.angvel(); + ang_vels.push(GpuVector3 { + x: 0.0, + y: 0.0, + z: angvel, + _padding: 0.0, + }); + } + + // User forces + let force = body.user_force(); + #[cfg(feature = "dim3")] + forces.push(Self::vector_to_gpu_from_vec3(&force)); + #[cfg(feature = "dim2")] + forces.push(Self::vector_to_gpu_from_vec2(&force)); + + #[cfg(feature = "dim3")] + { + let torque = body.user_torque(); + torques.push(GpuVector3 { + x: torque.x, + y: torque.y, + z: torque.z, + _padding: 0.0, + }); + } + #[cfg(feature = "dim2")] + { + let torque = body.user_torque(); + 
torques.push(GpuVector3 { + x: 0.0, + y: 0.0, + z: torque, + _padding: 0.0, + }); + } + + let inv_mass = body.mass_properties().local_mprops.inv_mass; + inv_masses.push(inv_mass); + + #[cfg(feature = "dim3")] + { + // SdpMatrix3 is symmetric, extract as Matrix3 + let inv_inertia = body.mass_properties().effective_world_inv_inertia; + let mut data = [0.0f32; 12]; + // SdpMatrix3 stores only 6 unique values (symmetric) + data[0] = inv_inertia.m11; + data[1] = inv_inertia.m12; + data[2] = inv_inertia.m13; + data[4] = inv_inertia.m12; // symmetric + data[5] = inv_inertia.m22; + data[6] = inv_inertia.m23; + data[8] = inv_inertia.m13; // symmetric + data[9] = inv_inertia.m23; // symmetric + data[10] = inv_inertia.m33; + inv_inertias.push(GpuMatrix3 { data }); + } + #[cfg(feature = "dim2")] + { + let inv_inertia = body.mass_properties().effective_world_inv_inertia; + inv_inertias.push(GpuMatrix3 { + data: [inv_inertia, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + }); + } + } + + // Upload to GPU + self.queue.write_buffer(&gpu_buffer.positions_buffer, 0, bytemuck::cast_slice(&positions)); + self.queue.write_buffer(&gpu_buffer.rotations_buffer, 0, bytemuck::cast_slice(&rotations)); + self.queue.write_buffer(&gpu_buffer.lin_velocities_buffer, 0, bytemuck::cast_slice(&lin_vels)); + self.queue.write_buffer(&gpu_buffer.ang_velocities_buffer, 0, bytemuck::cast_slice(&ang_vels)); + self.queue.write_buffer(&gpu_buffer.forces_buffer, 0, bytemuck::cast_slice(&forces)); + self.queue.write_buffer(&gpu_buffer.torques_buffer, 0, bytemuck::cast_slice(&torques)); + self.queue.write_buffer(&gpu_buffer.inv_masses_buffer, 0, bytemuck::cast_slice(&inv_masses)); + self.queue.write_buffer(&gpu_buffer.inv_inertias_buffer, 0, bytemuck::cast_slice(&inv_inertias)); + } + + /// Download rigid body data from GPU back to CPU. + /// Returns vectors of positions and velocities in SoA layout. 
+ pub fn download_rigid_bodies(&self, gpu_buffer: &RigidBodyGpuBuffer) -> (Vec, Vec) { + // Create staging buffers for readback + let positions_staging = self.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Positions Staging Buffer"), + size: (gpu_buffer.body_count * std::mem::size_of::()) as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + let velocities_staging = self.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Velocities Staging Buffer"), + size: (gpu_buffer.body_count * std::mem::size_of::()) as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + // Create command encoder and copy data + let mut encoder = self.device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("Download Encoder"), + }); + + encoder.copy_buffer_to_buffer( + &gpu_buffer.positions_buffer, + 0, + &positions_staging, + 0, + (gpu_buffer.body_count * std::mem::size_of::()) as u64, + ); + + encoder.copy_buffer_to_buffer( + &gpu_buffer.lin_velocities_buffer, + 0, + &velocities_staging, + 0, + (gpu_buffer.body_count * std::mem::size_of::()) as u64, + ); + + self.queue.submit(Some(encoder.finish())); + + // Map and read the staging buffers + let positions_slice = positions_staging.slice(..); + let velocities_slice = velocities_staging.slice(..); + + positions_slice.map_async(wgpu::MapMode::Read, |_| {}); + velocities_slice.map_async(wgpu::MapMode::Read, |_| {}); + + self.device.poll(wgpu::Maintain::Wait); + + let positions_data = positions_slice.get_mapped_range(); + let velocities_data = velocities_slice.get_mapped_range(); + + let positions: Vec = bytemuck::cast_slice(&positions_data).to_vec(); + let velocities: Vec = bytemuck::cast_slice(&velocities_data).to_vec(); + + drop(positions_data); + drop(velocities_data); + + positions_staging.unmap(); + velocities_staging.unmap(); + + (positions, velocities) + } + + /// 
Helper to convert Rapier vector to GPU format (3D). + #[cfg(feature = "dim3")] + fn vector_to_gpu_from_vec3(v: &Vec3) -> GpuVector3 { + GpuVector3 { + x: v.x, + y: v.y, + z: v.z, + _padding: 0.0, + } + } + + /// Helper to convert Rapier vector to GPU format (2D). + #[cfg(feature = "dim2")] + fn vector_to_gpu_from_vec2(v: &Vec2) -> GpuVector3 { + GpuVector3 { + x: v.x, + y: v.y, + z: 0.0, + _padding: 0.0, + } + } + + /// Helper to convert Rapier vector to GPU format. + fn vector_to_gpu(v: &na::Vector3) -> GpuVector3 { + GpuVector3 { + x: v.x, + y: v.y, + z: v.z, + _padding: 0.0, + } + } + + /// Helper to convert Rapier rotation to GPU format (3D). + #[cfg(feature = "dim3")] + fn rotation_to_gpu(rot: &Quat) -> GpuRotation { + GpuRotation { + x: rot.x, + y: rot.y, + z: rot.z, + w: rot.w, + } + } + + /// Helper to convert Rapier rotation to GPU format (2D). + #[cfg(feature = "dim2")] + fn rotation_to_gpu(rot: &crate::glamx::Rot2) -> GpuRotation { + GpuRotation { + angle: rot.angle(), + _padding: [0.0, 0.0, 0.0], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gpu_vector3_size() { + assert_eq!(std::mem::size_of::(), 16); + assert_eq!(std::mem::align_of::(), 4); + } + + #[test] + fn test_gpu_rotation_size() { + #[cfg(feature = "dim3")] + assert_eq!(std::mem::size_of::(), 16); + #[cfg(feature = "dim2")] + assert_eq!(std::mem::size_of::(), 16); + } +} diff --git a/src/gpu/device.rs b/src/gpu/device.rs new file mode 100644 index 000000000..877b60373 --- /dev/null +++ b/src/gpu/device.rs @@ -0,0 +1,171 @@ +//! GPU device initialization and management. + +use std::sync::Arc; +use wgpu; + +/// GPU context managing WGPU device, queue, and adapter. +/// +/// This is the main entry point for GPU acceleration. 
It handles: +/// - Device selection (prefers discrete GPUs) +/// - Feature validation (compute shaders required) +/// - Adapter capabilities checking +pub struct GpuContext { + pub device: Arc, + pub queue: Arc, + pub adapter: wgpu::Adapter, +} + +/// Errors that can occur during GPU initialization. +#[derive(Debug, thiserror::Error)] +pub enum GpuError { + #[error("No suitable GPU adapter found")] + NoAdapter, + + #[error("Failed to request device: {0}")] + DeviceRequest(#[from] wgpu::RequestDeviceError), + + #[error("Compute shaders not supported on this device")] + ComputeNotSupported, + + #[error("Insufficient GPU memory (required: {required} bytes, available: {available} bytes)")] + InsufficientMemory { required: u64, available: u64 }, +} + +impl GpuContext { + /// Creates a new GPU context with default settings. + /// + /// Prefers discrete GPUs over integrated ones for better performance. + /// Falls back gracefully if no suitable GPU is found. + /// + /// # Errors + /// + /// Returns `GpuError` if: + /// - No GPU adapter is found + /// - Compute shaders are not supported + /// - Device request fails + /// + /// # Example + /// + /// ```no_run + /// # use rapier3d::gpu::GpuContext; + /// let gpu = GpuContext::new().expect("Failed to initialize GPU"); + /// println!("Using GPU: {:?}", gpu.adapter_info().name); + /// ``` + pub fn new() -> Result { + pollster::block_on(Self::new_async()) + } + + /// Async version of `new()` for async runtimes. 
+ pub async fn new_async() -> Result { + let instance = wgpu::Instance::new(wgpu::InstanceDescriptor { + backends: wgpu::Backends::all(), + ..Default::default() + }); + + // Request adapter with preference for discrete GPU + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { + power_preference: wgpu::PowerPreference::HighPerformance, + compatible_surface: None, + force_fallback_adapter: false, + }) + .await + .ok_or(GpuError::NoAdapter)?; + + let adapter_info = adapter.get_info(); + log::info!("Selected GPU: {} ({:?})", adapter_info.name, adapter_info.backend); + log::info!("GPU Type: {:?}", adapter_info.device_type); + + // Verify compute shader support + let features = adapter.features(); + if !features.contains(wgpu::Features::empty()) { + // Basic compute is always available in WebGPU + log::debug!("Compute shaders supported"); + } + + // Request device with required features + let (device, queue) = adapter + .request_device( + &wgpu::DeviceDescriptor { + label: Some("Rapier GPU Device"), + required_features: wgpu::Features::empty(), // Compute is baseline + required_limits: wgpu::Limits { + max_compute_workgroup_size_x: 256, + max_compute_workgroup_size_y: 256, + max_compute_workgroup_size_z: 64, + max_compute_invocations_per_workgroup: 256, + max_compute_workgroups_per_dimension: 65535, + ..Default::default() + }, + memory_hints: Default::default(), + }, + None, + ) + .await?; + + log::info!("GPU device initialized successfully"); + + Ok(Self { + device: Arc::new(device), + queue: Arc::new(queue), + adapter, + }) + } + + /// Returns information about the selected GPU adapter. + pub fn adapter_info(&self) -> wgpu::AdapterInfo { + self.adapter.get_info() + } + + /// Returns the limits of the GPU device. + pub fn limits(&self) -> wgpu::Limits { + self.device.limits() + } + + /// Checks if the GPU has sufficient memory for the given requirement. 
+ pub fn check_memory_requirement(&self, required_bytes: u64) -> Result<(), GpuError> { + // WGPU doesn't expose memory info directly, but we can check limits + let limits = self.limits(); + let max_buffer_size = limits.max_buffer_size; + + if required_bytes > max_buffer_size { + return Err(GpuError::InsufficientMemory { + required: required_bytes, + available: max_buffer_size, + }); + } + + Ok(()) + } + + /// Returns the maximum workgroup size for compute shaders. + pub fn max_workgroup_size(&self) -> (u32, u32, u32) { + let limits = self.limits(); + ( + limits.max_compute_workgroup_size_x, + limits.max_compute_workgroup_size_y, + limits.max_compute_workgroup_size_z, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gpu_context_creation() { + // This test requires a GPU to be available + if let Ok(ctx) = GpuContext::new() { + let info = ctx.adapter_info(); + println!("GPU: {} ({:?})", info.name, info.backend); + + let (x, y, z) = ctx.max_workgroup_size(); + assert!(x >= 256, "Workgroup X size too small"); + assert!(y >= 256, "Workgroup Y size too small"); + assert!(z >= 64, "Workgroup Z size too small"); + } else { + println!("Skipping GPU test - no suitable GPU found"); + } + } +} diff --git a/src/gpu/integrator.rs b/src/gpu/integrator.rs new file mode 100644 index 000000000..447b8ba3f --- /dev/null +++ b/src/gpu/integrator.rs @@ -0,0 +1,251 @@ +/// GPU-accelerated integration kernel. 
+/// +/// Performs symplectic Euler integration on the GPU: +/// - Velocity integration: v' = v + (F/m + g) * dt +/// - Position integration: p' = p + v' * dt +/// - Angular damping and velocity update + +use crate::gpu::{GpuContext, BufferManager, RigidBodyGpuBuffer}; +use crate::dynamics::RigidBodySet; +use bytemuck::{Pod, Zeroable}; +use wgpu::util::DeviceExt; + +/// Integration parameters passed to GPU shader +#[repr(C)] +#[derive(Copy, Clone, Pod, Zeroable)] +struct IntegrationParams { + body_count: u32, + dt: f32, + gravity_x: f32, + gravity_y: f32, + gravity_z: f32, + linear_damping: f32, + angular_damping: f32, + _padding: f32, +} + +/// GPU integration kernel +pub struct GpuIntegrator { + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, +} + +impl GpuIntegrator { + /// Create a new GPU integrator + pub fn new(device: &wgpu::Device) -> Self { + // Load shader + let shader_source = include_str!("shaders/integration.wgsl"); + let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("Integration Shader"), + source: wgpu::ShaderSource::Wgsl(shader_source.into()), + }); + + // Create bind group layout + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("Integration Bind Group Layout"), + entries: &[ + // Uniform buffer: params + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Storage buffers (read_write) + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Rotations (binding 2) + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: 
wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Lin velocities (binding 3) + wgpu::BindGroupLayoutEntry { + binding: 3, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Ang velocities (binding 4) + wgpu::BindGroupLayoutEntry { + binding: 4, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Forces (binding 5, read-only) + wgpu::BindGroupLayoutEntry { + binding: 5, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Torques (binding 6, read-only) + wgpu::BindGroupLayoutEntry { + binding: 6, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Inv masses (binding 7, read-only) + wgpu::BindGroupLayoutEntry { + binding: 7, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + + // Create pipeline layout + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("Integration Pipeline Layout"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + // Create compute pipeline + let pipeline = 
device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("Integration Pipeline"), + layout: Some(&pipeline_layout), + module: &shader, + entry_point: "main", + compilation_options: Default::default(), + cache: None, + }); + + Self { + pipeline, + bind_group_layout, + } + } + + /// Run integration on GPU + pub fn integrate( + &self, + device: &wgpu::Device, + queue: &wgpu::Queue, + gpu_buffer: &mut RigidBodyGpuBuffer, + dt: f32, + gravity: [f32; 3], + linear_damping: f32, + angular_damping: f32, + ) { + let params = IntegrationParams { + body_count: gpu_buffer.body_count as u32, + dt, + gravity_x: gravity[0], + gravity_y: gravity[1], + gravity_z: gravity[2], + linear_damping, + angular_damping, + _padding: 0.0, + }; + + // Create params buffer + let params_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("Integration Params"), + contents: bytemuck::cast_slice(&[params]), + usage: wgpu::BufferUsages::UNIFORM, + }); + + // Create bind group + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("Integration Bind Group"), + layout: &self.bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: params_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: gpu_buffer.positions_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: gpu_buffer.rotations_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 3, + resource: gpu_buffer.lin_velocities_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 4, + resource: gpu_buffer.ang_velocities_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 5, + resource: gpu_buffer.forces_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 6, + resource: gpu_buffer.torques_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 7, + resource: 
gpu_buffer.inv_masses_buffer.as_entire_binding(), + }, + ], + }); + + // Create command encoder and dispatch compute + let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("Integration Encoder"), + }); + + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("Integration Pass"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(&self.pipeline); + compute_pass.set_bind_group(0, &bind_group, &[]); + + // Dispatch workgroups (256 threads per workgroup) + let workgroup_count = (gpu_buffer.body_count as u32 + 255) / 256; + compute_pass.dispatch_workgroups(workgroup_count, 1, 1); + } + + queue.submit(Some(encoder.finish())); + } +} diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs new file mode 100644 index 000000000..8bfa9e518 --- /dev/null +++ b/src/gpu/mod.rs @@ -0,0 +1,66 @@ +//! GPU acceleration module using WGPU compute shaders. +//! +//! This module provides GPU-accelerated implementations of compute-intensive +//! physics operations including collision detection, constraint solving, and +//! integration. It's designed as an optional, incremental enhancement that +//! coexists with the existing CPU implementation. +//! +//! # Features +//! +//! Enable GPU acceleration with the `gpu-acceleration` feature flag: +//! ```toml +//! rapier3d = { version = "0.32", features = ["gpu-acceleration"] } +//! ``` +//! +//! # Architecture +//! +//! - **GpuContext**: Manages WGPU device and queue lifecycle +//! - **BufferManager**: Handles CPU↔GPU data transfers +//! - **GpuIntegrator**: Position/velocity integration on GPU +//! - **GpuBroadPhase**: BVH-based collision detection (future) +//! - **GpuConstraintSolver**: Constraint resolution (future) +//! +//! # Example +//! +//! ```no_run +//! # #[cfg(feature = "gpu-acceleration")] +//! # { +//! use rapier3d::prelude::*; +//! use rapier3d::gpu::GpuContext; +//! +//! // Initialize GPU context +//! 
let gpu_ctx = GpuContext::new().expect("Failed to initialize GPU"); +//! +//! // Use in physics pipeline (future API) +//! let pipeline = PhysicsPipeline::new() +//! .with_gpu_context(gpu_ctx); +//! # } +//! ``` + +#[cfg(feature = "gpu-acceleration")] +mod device; +#[cfg(feature = "gpu-acceleration")] +mod buffer_manager; +#[cfg(feature = "gpu-acceleration")] +mod pipeline; +#[cfg(feature = "gpu-acceleration")] +mod integrator; +#[cfg(feature = "gpu-acceleration")] +mod resident_state; +#[cfg(feature = "gpu-acceleration")] +mod tests; + +#[cfg(feature = "gpu-acceleration")] +pub use device::GpuContext; +#[cfg(feature = "gpu-acceleration")] +pub use buffer_manager::{BufferManager, RigidBodyGpuBuffer}; +#[cfg(feature = "gpu-acceleration")] +pub use pipeline::GpuComputePipeline; +#[cfg(feature = "gpu-acceleration")] +pub use integrator::GpuIntegrator; +#[cfg(feature = "gpu-acceleration")] +pub use resident_state::GpuResidentState; + +/// Re-export WGPU types for convenience +#[cfg(feature = "gpu-acceleration")] +pub use wgpu; diff --git a/src/gpu/pipeline.rs b/src/gpu/pipeline.rs new file mode 100644 index 000000000..840c76976 --- /dev/null +++ b/src/gpu/pipeline.rs @@ -0,0 +1,155 @@ +//! Compute pipeline abstraction for GPU operations. + +use wgpu; + +/// Wrapper for WGPU compute pipelines. +/// +/// Provides a simplified interface for creating and executing compute shaders. +pub struct GpuComputePipeline { + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, +} + +impl GpuComputePipeline { + /// Creates a new compute pipeline from WGSL shader source. 
+ /// + /// # Arguments + /// + /// * `device` - WGPU device + /// * `shader_source` - WGSL shader source code + /// * `entry_point` - Entry point function name (usually "main") + /// * `bind_group_layout_entries` - Buffer binding descriptions + pub fn new( + device: &wgpu::Device, + shader_source: &str, + entry_point: &str, + bind_group_layout_entries: &[wgpu::BindGroupLayoutEntry], + ) -> Self { + let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("Compute Shader"), + source: wgpu::ShaderSource::Wgsl(shader_source.into()), + }); + + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("Compute Bind Group Layout"), + entries: bind_group_layout_entries, + }); + + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("Compute Pipeline Layout"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("Compute Pipeline"), + layout: Some(&pipeline_layout), + module: &shader_module, + entry_point: entry_point, + compilation_options: Default::default(), + cache: None, + }); + + Self { + pipeline, + bind_group_layout, + } + } + + /// Creates a bind group for this pipeline. + /// + /// # Arguments + /// + /// * `device` - WGPU device + /// * `buffers` - Buffer bindings (must match bind_group_layout order) + pub fn create_bind_group( + &self, + device: &wgpu::Device, + buffers: &[&wgpu::Buffer], + ) -> wgpu::BindGroup { + let entries: Vec = buffers + .iter() + .enumerate() + .map(|(i, buffer)| wgpu::BindGroupEntry { + binding: i as u32, + resource: buffer.as_entire_binding(), + }) + .collect(); + + device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("Compute Bind Group"), + layout: &self.bind_group_layout, + entries: &entries, + }) + } + + /// Dispatches the compute shader. 
+ /// + /// # Arguments + /// + /// * `encoder` - Command encoder + /// * `bind_group` - Bind group with buffer bindings + /// * `workgroups` - Number of workgroups in (x, y, z) + pub fn dispatch( + &self, + encoder: &mut wgpu::CommandEncoder, + bind_group: &wgpu::BindGroup, + workgroups: (u32, u32, u32), + ) { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("Compute Pass"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(&self.pipeline); + compute_pass.set_bind_group(0, bind_group, &[]); + compute_pass.dispatch_workgroups(workgroups.0, workgroups.1, workgroups.2); + } +} + +/// Helper to create a standard storage buffer binding layout entry. +pub fn storage_buffer_binding(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry { + wgpu::BindGroupLayoutEntry { + binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + } +} + +/// Helper to create a uniform buffer binding layout entry. 
+pub fn uniform_buffer_binding(binding: u32) -> wgpu::BindGroupLayoutEntry { + wgpu::BindGroupLayoutEntry { + binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_storage_buffer_binding() { + let entry = storage_buffer_binding(0, true); + assert_eq!(entry.binding, 0); + assert_eq!(entry.visibility, wgpu::ShaderStages::COMPUTE); + } + + #[test] + fn test_uniform_buffer_binding() { + let entry = uniform_buffer_binding(1); + assert_eq!(entry.binding, 1); + assert_eq!(entry.visibility, wgpu::ShaderStages::COMPUTE); + } +} diff --git a/src/gpu/resident_state.rs b/src/gpu/resident_state.rs new file mode 100644 index 000000000..bd0d1f09b --- /dev/null +++ b/src/gpu/resident_state.rs @@ -0,0 +1,210 @@ +//! GPU-resident physics state manager. +//! +//! This module implements the PhysX-style architecture where rigid body +//! state lives permanently on the GPU. Only deltas (new/changed/removed +//! bodies) are transferred, minimizing CPU↔GPU bandwidth. + +use crate::dynamics::{RigidBodyHandle, RigidBodySet}; +use super::buffer_manager::{BufferManager, RigidBodyGpuBuffer}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +/// Tracks which bodies have been modified and need GPU updates. +#[derive(Default)] +struct DirtyTracker { + /// Bodies added since last sync + added: HashSet, + /// Bodies removed since last sync + removed: HashSet, + /// Bodies whose state changed (forces, velocities, etc.) + modified: HashSet, +} + +impl DirtyTracker { + fn clear(&mut self) { + self.added.clear(); + self.removed.clear(); + self.modified.clear(); + } + + fn is_empty(&self) -> bool { + self.added.is_empty() && self.removed.is_empty() && self.modified.is_empty() + } +} + +/// GPU-resident rigid body state with delta tracking. 
+/// +/// Architecture: +/// - Body data lives on GPU permanently +/// - CPU tracks which bodies are dirty (added/modified/removed) +/// - `sync_to_gpu()` uploads only deltas +/// - `readback_for_rendering()` downloads only positions/rotations +pub struct GpuResidentState { + device: Arc, + queue: Arc, + buffer_manager: BufferManager, + + /// GPU buffers (persistent) + gpu_buffer: Option, + + /// Maps RigidBodyHandle → GPU buffer index + handle_to_index: HashMap, + + /// Dirty tracking for incremental updates + dirty: DirtyTracker, + + /// Current capacity + capacity: usize, + + /// Total bodies currently on GPU + gpu_body_count: usize, +} + +impl GpuResidentState { + /// Create a new GPU-resident state manager. + pub fn new(device: Arc, queue: Arc, initial_capacity: usize) -> Self { + let buffer_manager = BufferManager::new(device.clone(), queue.clone()); + let gpu_buffer = Some(buffer_manager.create_rigid_body_buffer(initial_capacity)); + + Self { + device, + queue, + buffer_manager, + gpu_buffer, + handle_to_index: HashMap::new(), + dirty: DirtyTracker::default(), + capacity: initial_capacity, + gpu_body_count: 0, + } + } + + /// Mark a body as added (will be uploaded on next sync). + pub fn mark_added(&mut self, handle: RigidBodyHandle) { + self.dirty.added.insert(handle); + } + + /// Mark a body as modified (will be re-uploaded on next sync). + pub fn mark_modified(&mut self, handle: RigidBodyHandle) { + if !self.dirty.added.contains(&handle) { + self.dirty.modified.insert(handle); + } + } + + /// Mark a body as removed (will be deleted from GPU on next sync). + pub fn mark_removed(&mut self, handle: RigidBodyHandle) { + self.dirty.removed.insert(handle); + self.dirty.added.remove(&handle); + self.dirty.modified.remove(&handle); + } + + /// Sync dirty bodies to GPU (incremental upload). + /// + /// This is the key optimization: only upload what changed! 
+ pub fn sync_to_gpu(&mut self, bodies: &RigidBodySet) { + if self.dirty.is_empty() { + return; // Nothing to do! + } + + // Handle removals + for handle in &self.dirty.removed { + if let Some(index) = self.handle_to_index.remove(handle) { + // Swap-remove: move last body to this slot + if index < self.gpu_body_count - 1 { + // Find handle of last body and update its index + let last_handle = self.handle_to_index + .iter() + .find(|(_, &idx)| idx == self.gpu_body_count - 1) + .map(|(h, _)| *h); + + if let Some(last_handle) = last_handle { + self.handle_to_index.insert(last_handle, index); + // Mark last body as modified so it gets copied to new slot + self.dirty.modified.insert(last_handle); + } + } + self.gpu_body_count -= 1; + } + } + + // Handle additions + for handle in &self.dirty.added { + if self.gpu_body_count >= self.capacity { + // Resize GPU buffers + self.resize_gpu_buffers(self.capacity * 2); + } + + let index = self.gpu_body_count; + self.handle_to_index.insert(*handle, index); + self.gpu_body_count += 1; + } + + // Upload modified + added bodies + let mut bodies_to_upload = self.dirty.added.iter() + .chain(self.dirty.modified.iter()) + .filter_map(|h| bodies.get(*h).map(|b| (*h, b))) + .collect::>(); + + if !bodies_to_upload.is_empty() { + // TODO: Implement partial buffer upload (currently uploads all) + // For now, upload entire buffer (suboptimal but works) + self.buffer_manager.upload_rigid_bodies(bodies, self.gpu_buffer.as_mut().unwrap()); + } + + self.dirty.clear(); + } + + /// Readback positions/rotations for rendering (minimal data transfer). + /// + /// Only downloads what's needed for visualization - doesn't sync full state. + pub fn readback_for_rendering(&self) -> (Vec, Vec) { + if let Some(ref gpu_buffer) = self.gpu_buffer { + self.buffer_manager.download_rigid_bodies(gpu_buffer) + } else { + (Vec::new(), Vec::new()) + } + } + + /// Get GPU buffer for compute operations. 
+ pub fn gpu_buffer_mut(&mut self) -> Option<&mut RigidBodyGpuBuffer> { + self.gpu_buffer.as_mut() + } + + /// Get current GPU body count. + pub fn body_count(&self) -> usize { + self.gpu_body_count + } + + /// Resize GPU buffers to new capacity. + fn resize_gpu_buffers(&mut self, new_capacity: usize) { + log::info!("Resizing GPU buffers: {} → {}", self.capacity, new_capacity); + + // Create new larger buffer + let new_buffer = self.buffer_manager.create_rigid_body_buffer(new_capacity); + + // TODO: Copy old buffer data to new buffer on GPU (avoid CPU roundtrip) + // For now, mark all bodies as modified to trigger re-upload + for handle in self.handle_to_index.keys() { + self.dirty.modified.insert(*handle); + } + + self.gpu_buffer = Some(new_buffer); + self.capacity = new_capacity; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_dirty_tracking() { + let mut tracker = DirtyTracker::default(); + let handle = RigidBodyHandle::from_raw_parts(0, 0); + + tracker.added.insert(handle); + assert!(!tracker.is_empty()); + + tracker.clear(); + assert!(tracker.is_empty()); + } +} diff --git a/src/gpu/shaders/integration.wgsl b/src/gpu/shaders/integration.wgsl new file mode 100644 index 000000000..55c19c4b8 --- /dev/null +++ b/src/gpu/shaders/integration.wgsl @@ -0,0 +1,98 @@ +// Integration Kernel - Symplectic Euler Integration +// +// Computes next frame state for rigid bodies: +// 1. Velocity integration: v' = v + (F/m + gravity) * dt +// 2. Position integration: p' = p + v' * dt +// 3. Angular integration: ω' = I⁻¹(τ * dt), rotation update +// 4. 
// 4. Apply linear/angular damping

struct IntegrationParams {
    body_count: u32,
    dt: f32,
    gravity_x: f32,
    gravity_y: f32,
    gravity_z: f32,
    linear_damping: f32,
    angular_damping: f32,
    _padding: f32,
}

struct GpuVector3 {
    x: f32,
    y: f32,
    z: f32,
    _padding: f32,
}

struct GpuRotation {
    x: f32,
    y: f32,
    z: f32,
    w: f32,
}

// Address-space/access qualifiers restored (they were stripped in transit):
// params is a uniform; state buffers are read_write storage; per-body inputs
// are read-only storage. These must match the Rust bind group layout.
@group(0) @binding(0) var<uniform> params: IntegrationParams;
@group(0) @binding(1) var<storage, read_write> positions: array<GpuVector3>;
@group(0) @binding(2) var<storage, read_write> rotations: array<GpuRotation>;
@group(0) @binding(3) var<storage, read_write> lin_velocities: array<GpuVector3>;
@group(0) @binding(4) var<storage, read_write> ang_velocities: array<GpuVector3>;
@group(0) @binding(5) var<storage, read> forces: array<GpuVector3>;
@group(0) @binding(6) var<storage, read> torques: array<GpuVector3>;
@group(0) @binding(7) var<storage, read> inv_masses: array<f32>;

@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let idx = global_id.x;

    if (idx >= params.body_count) {
        return;
    }

    let inv_mass = inv_masses[idx];

    // Skip fixed/static bodies (infinite mass is encoded as inv_mass == 0).
    if (inv_mass == 0.0) {
        return;
    }

    // Load current state.
    var pos = vec3<f32>(positions[idx].x, positions[idx].y, positions[idx].z);
    var vel = vec3<f32>(lin_velocities[idx].x, lin_velocities[idx].y, lin_velocities[idx].z);
    var ang_vel = vec3<f32>(ang_velocities[idx].x, ang_velocities[idx].y, ang_velocities[idx].z);
    let force = vec3<f32>(forces[idx].x, forces[idx].y, forces[idx].z);
    let torque = vec3<f32>(torques[idx].x, torques[idx].y, torques[idx].z);

    let gravity = vec3<f32>(params.gravity_x, params.gravity_y, params.gravity_z);

    // 1. Velocity integration: v' = v + (F/m + g) * dt
    let acceleration = force * inv_mass + gravity;
    vel = vel + acceleration * params.dt;

    // 2. Apply linear damping: v' = v * (1 - damping)^dt,
    //    approximated as v * (1 - damping * dt) for small dt.
    vel = vel * (1.0 - params.linear_damping * params.dt);

    // 3. Position integration: p' = p + v * dt (symplectic Euler).
    pos = pos + vel * params.dt;

    // 4. Angular velocity damping.
    ang_vel = ang_vel * (1.0 - params.angular_damping * params.dt);

    // 5. Rotation integration (simplified - not yet implemented).
    //    Full quaternion integration would be: q' = q + 0.5 * q * [0, ω] * dt
    //    TODO: implement proper quaternion integration. Note that `torque` is
    //    loaded above but not yet applied to the angular velocity.

    // Write back updated state.
    positions[idx].x = pos.x;
    positions[idx].y = pos.y;
    positions[idx].z = pos.z;

    lin_velocities[idx].x = vel.x;
    lin_velocities[idx].y = vel.y;
    lin_velocities[idx].z = vel.z;

    ang_velocities[idx].x = ang_vel.x;
    ang_velocities[idx].y = ang_vel.y;
    ang_velocities[idx].z = ang_vel.z;
}

// ==== file: src/gpu/tests.rs (new) ====

//! GPU validation tests - compare GPU results against CPU reference.
//!
//! This module provides a testing framework to ensure GPU-accelerated
//! physics produces identical (or numerically equivalent) results to
//! the CPU implementation.

#[cfg(all(test, feature = "gpu-acceleration"))]
mod tests {
    use crate::dynamics::{RigidBodyBuilder, RigidBodySet};
    use crate::geometry::{ColliderBuilder, ColliderSet};
    use crate::gpu::{BufferManager, GpuContext};
    use crate::math::{Real, Vector};
    use approx::assert_relative_eq;

    /// Tolerance for floating-point comparisons (CPU vs GPU).
    const TOLERANCE: Real = 1e-5;
+ fn create_test_scene() -> (RigidBodySet, ColliderSet) { + let mut bodies = RigidBodySet::new(); + let mut colliders = ColliderSet::new(); + + // Create a dynamic body + let rb = RigidBodyBuilder::dynamic() + .translation(Vector::new(0.0, 10.0, 0.0)) + .linvel(Vector::new(1.0, -2.0, 0.5)) + .build(); + let rb_handle = bodies.insert(rb); + + // Add a collider + #[cfg(feature = "dim3")] + let collider = ColliderBuilder::ball(0.5).build(); + #[cfg(feature = "dim2")] + let collider = ColliderBuilder::ball(0.5).build(); + + colliders.insert_with_parent(collider, rb_handle, &mut bodies); + + // Create another dynamic body + let rb2 = RigidBodyBuilder::dynamic() + .translation(Vector::new(2.0, 5.0, -1.0)) + .linvel(Vector::new(-0.5, 1.0, 0.0)) + .build(); + let rb2_handle = bodies.insert(rb2); + + #[cfg(feature = "dim3")] + let collider2 = ColliderBuilder::cuboid(0.5, 0.5, 0.5).build(); + #[cfg(feature = "dim2")] + let collider2 = ColliderBuilder::cuboid(0.5, 0.5).build(); + + colliders.insert_with_parent(collider2, rb2_handle, &mut bodies); + + (bodies, colliders) + } + + /// Helper to compare two rigid body sets for equality. 
+    fn assert_bodies_equivalent(cpu_bodies: &RigidBodySet, gpu_bodies: &RigidBodySet) {
+        assert_eq!(
+            cpu_bodies.len(),
+            gpu_bodies.len(),
+            "Body count mismatch"
+        );
+
+        for (handle, cpu_body) in cpu_bodies.iter() {
+            let gpu_body = gpu_bodies
+                .get(handle)
+                .expect("GPU missing body that CPU has");
+
+            // Compare positions
+            let cpu_pos = &cpu_body.position().translation;
+            let gpu_pos = &gpu_body.position().translation;
+
+            #[cfg(feature = "dim3")]
+            {
+                assert_relative_eq!(cpu_pos.x, gpu_pos.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_pos.y, gpu_pos.y, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_pos.z, gpu_pos.z, epsilon = TOLERANCE);
+            }
+            #[cfg(feature = "dim2")]
+            {
+                assert_relative_eq!(cpu_pos.x, gpu_pos.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_pos.y, gpu_pos.y, epsilon = TOLERANCE);
+            }
+
+            // Compare velocities
+            let cpu_vel = cpu_body.linvel();
+            let gpu_vel = gpu_body.linvel();
+
+            #[cfg(feature = "dim3")]
+            {
+                assert_relative_eq!(cpu_vel.x, gpu_vel.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_vel.y, gpu_vel.y, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_vel.z, gpu_vel.z, epsilon = TOLERANCE);
+            }
+            #[cfg(feature = "dim2")]
+            {
+                assert_relative_eq!(cpu_vel.x, gpu_vel.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_vel.y, gpu_vel.y, epsilon = TOLERANCE);
+            }
+
+            // Compare angular velocity
+            #[cfg(feature = "dim3")]
+            {
+                let cpu_angvel = cpu_body.angvel();
+                let gpu_angvel = gpu_body.angvel();
+                assert_relative_eq!(cpu_angvel.x, gpu_angvel.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_angvel.y, gpu_angvel.y, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_angvel.z, gpu_angvel.z, epsilon = TOLERANCE);
+            }
+            #[cfg(feature = "dim2")]
+            {
+                let cpu_angvel = cpu_body.angvel();
+                let gpu_angvel = gpu_body.angvel();
+                assert_relative_eq!(*cpu_angvel, *gpu_angvel, epsilon = TOLERANCE);
+            }
+        }
+    }
+
+    #[test]
+    fn test_gpu_context_initialization() {
+        // Test that we can initialize GPU context
+        let gpu_ctx = GpuContext::new();
+
+        if gpu_ctx.is_err() {
+            println!("Skipping GPU test - no suitable GPU found");
+            return;
+        }
+
+        let gpu_ctx = gpu_ctx.unwrap();
+        let info = gpu_ctx.adapter_info();
+        println!("GPU: {} ({:?})", info.name, info.backend);
+    }
+
+    #[test]
+    fn test_buffer_upload_download() {
+        // Test that we can upload and download data to GPU
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let (bodies, _colliders) = create_test_scene();
+
+        let buffer_manager = BufferManager::new(
+            gpu_ctx.device,
+            gpu_ctx.queue
+        );
+
+        let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len());
+
+        // Upload to GPU
+        buffer_manager.upload_rigid_bodies(&bodies, &mut gpu_buffer);
+
+        println!(
+            "Uploaded {} bodies to GPU buffer (capacity: {})",
+            gpu_buffer.body_count,
+            gpu_buffer.capacity
+        );
+
+        // TODO: Add download and comparison when we implement readback
+        assert_eq!(gpu_buffer.body_count, bodies.len());
+    }
+
+    #[test]
+    #[ignore] // Enable once integration kernel is implemented
+    fn test_integration_cpu_vs_gpu() {
+        // This test will compare CPU integration against GPU integration
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let (mut cpu_bodies, _colliders) = create_test_scene();
+        let (mut gpu_bodies, _) = create_test_scene();
+
+        let dt = 0.016; // 60 FPS
+
+        // TODO: Run CPU integration
+        // for (_handle, body) in cpu_bodies.iter_mut() {
+        //     body.integrate_forces(dt);
+        // }
+
+        // TODO: Run GPU integration
+        // let buffer_manager = BufferManager::new(
+        //     gpu_ctx.device.clone(),
+        //     gpu_ctx.queue.clone()
+        // );
+        // gpu_integrator.integrate(&mut gpu_bodies, dt);
+
+        // Compare results
+        assert_bodies_equivalent(&cpu_bodies, &gpu_bodies);
+    }
+
+    #[test]
+    #[ignore] // Enable once collision detection is implemented
+    fn test_collision_detection_cpu_vs_gpu() {
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        // TODO: Create scene with colliding bodies
+        // TODO: Run CPU collision detection
+        // TODO: Run GPU collision detection
+        // TODO: Compare contact manifolds
+
+        println!("GPU collision detection test (not yet implemented)");
+    }
+
+    #[test]
+    #[ignore] // Enable once constraint solver is implemented
+    fn test_solver_cpu_vs_gpu() {
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        // TODO: Create scene with constraints
+        // TODO: Run CPU solver
+        // TODO: Run GPU solver
+        // TODO: Compare constraint impulses and body velocities
+
+        println!("GPU solver test (not yet implemented)");
+    }
+
+    #[test]
+    #[ignore] // Enable for long-running stability tests
+    fn test_long_simulation_cpu_vs_gpu() {
+        // Run a 10-second simulation and ensure CPU and GPU stay in sync
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let (mut cpu_bodies, cpu_colliders) = create_test_scene();
+        let (mut gpu_bodies, gpu_colliders) = create_test_scene();
+
+        let dt = 0.016; // 60 FPS
+        let steps = 600; // 10 seconds
+
+        for step in 0..steps {
+            // TODO: Step CPU physics
+            // TODO: Step GPU physics
+
+            // Compare every 60 frames (1 second)
+            if step % 60 == 0 {
+                println!("Comparing at step {}", step);
+                assert_bodies_equivalent(&cpu_bodies, &gpu_bodies);
+            }
+        }
+
+        // Final comparison
+        assert_bodies_equivalent(&cpu_bodies, &gpu_bodies);
+    }
+
+    #[test]
+    fn test_stress_test_many_bodies() {
+        // Test with 1000+ bodies to ensure GPU scales
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let mut bodies = RigidBodySet::new();
+
+        // Create 1000 bodies in a grid
+        for i in 0..10 {
+            for j in 0..10 {
+                for k in 0..10 {
+                    #[cfg(feature = "dim3")]
+                    let pos = Vector::new(i as Real * 2.0, j as Real * 2.0, k as Real * 2.0);
+                    #[cfg(feature = "dim2")]
+                    let pos = Vector::new(i as Real * 2.0, j as Real * 2.0);
+
+                    let rb = RigidBodyBuilder::dynamic()
+                        .translation(pos)
+                        .build();
+                    bodies.insert(rb);
+                }
+            }
+        }
+
+        println!("Created {} bodies", bodies.len());
+
+        let buffer_manager = BufferManager::new(
+            gpu_ctx.device,
+            gpu_ctx.queue
+        );
+
+        let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len());
+        buffer_manager.upload_rigid_bodies(&bodies, &mut gpu_buffer);
+
+        assert_eq!(gpu_buffer.body_count, bodies.len());
+        println!("Successfully uploaded {} bodies to GPU", gpu_buffer.body_count);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 8b3366635..ff9e3dedb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -169,6 +169,9 @@ pub mod geometry;
 pub mod pipeline;
 pub mod utils;
 
+#[cfg(feature = "gpu-acceleration")]
+pub mod gpu;
+
 /// Elementary mathematical entities (vectors, matrices, isometries, etc).
 pub mod math {
     pub use parry::math::*;