diff --git a/crates/rapier2d-f64/Cargo.toml b/crates/rapier2d-f64/Cargo.toml index 099323811..0ce017bab 100644 --- a/crates/rapier2d-f64/Cargo.toml +++ b/crates/rapier2d-f64/Cargo.toml @@ -49,6 +49,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry2d-f64/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. debug-disable-legitimate-fe-exceptions = [] @@ -89,6 +90,11 @@ thiserror = "2" profiling = "1.0" static_assertions = "1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" diff --git a/crates/rapier2d/Cargo.toml b/crates/rapier2d/Cargo.toml index 75122662d..6578747ab 100644 --- a/crates/rapier2d/Cargo.toml +++ b/crates/rapier2d/Cargo.toml @@ -49,6 +49,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry2d/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. 
debug-disable-legitimate-fe-exceptions = [] @@ -92,8 +93,19 @@ static_assertions = "1" # TODO: should be re-exported from simba wide = "0.7.1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" serde = { version = "1", features = ["derive"] } oorandom = { version = "11", default-features = false } +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "gpu_benchmarks" +harness = false +required-features = ["gpu-acceleration"] diff --git a/crates/rapier2d/benches/gpu_benchmarks.rs b/crates/rapier2d/benches/gpu_benchmarks.rs new file mode 100644 index 000000000..e69de29bb diff --git a/crates/rapier3d-f64/Cargo.toml b/crates/rapier3d-f64/Cargo.toml index 7c295e121..d35962561 100644 --- a/crates/rapier3d-f64/Cargo.toml +++ b/crates/rapier3d-f64/Cargo.toml @@ -52,6 +52,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry3d-f64/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. 
debug-disable-legitimate-fe-exceptions = [] @@ -92,6 +93,11 @@ thiserror = "2" profiling = "1.0" static_assertions = "1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" diff --git a/crates/rapier3d/Cargo.toml b/crates/rapier3d/Cargo.toml index 82470777d..cc60ad11c 100644 --- a/crates/rapier3d/Cargo.toml +++ b/crates/rapier3d/Cargo.toml @@ -53,6 +53,7 @@ serde-serialize = [ enhanced-determinism = ["simba/libm_force", "parry3d/enhanced-determinism"] debug-render = [] profiler = ["dep:web-time"] # Enables the internal profiler. +gpu-acceleration = ["dep:wgpu", "dep:bytemuck", "dep:pollster"] # Enables GPU compute via WGPU # Feature used for debugging only. debug-disable-legitimate-fe-exceptions = [] @@ -96,8 +97,19 @@ static_assertions = "1" # TODO: should be re-exported from simba wide = "0.7.1" +# GPU acceleration dependencies +wgpu = { version = "22", optional = true } +bytemuck = { version = "1.14", optional = true, features = ["derive"] } +pollster = { version = "0.3", optional = true } + [dev-dependencies] bincode = "1" serde_json = "1" serde = { version = "1", features = ["derive"] } oorandom = { version = "11", default-features = false } +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "gpu_benchmarks" +harness = false +required-features = ["gpu-acceleration"] diff --git a/crates/rapier3d/benches/gpu_benchmarks.rs b/crates/rapier3d/benches/gpu_benchmarks.rs new file mode 100644 index 000000000..e9d22836b --- /dev/null +++ b/crates/rapier3d/benches/gpu_benchmarks.rs @@ -0,0 +1,195 @@ +//! GPU vs CPU benchmarks: Direct comparison with summary table. +//! +//! Compares CPU and GPU performance for physics operations +//! and prints an easy-to-read comparison table.
+ +use rapier3d::prelude::*; +use rapier3d::gpu::{GpuContext, BufferManager}; +use std::time::Instant; + +/// Test scales: body counts +const SCALES: &[usize] = &[10, 50, 100, 500, 1_000, 5_000, 10_000]; +const ITERATIONS: usize = 10; // Number of iterations to average + +struct BenchmarkResult { + scale: usize, + cpu_time_us: f64, + gpu_time_us: f64, + speedup: f64, +} + +/// Helper to create a scene with N bodies in a grid. +fn create_test_bodies(count: usize) -> RigidBodySet { + let mut bodies = RigidBodySet::new(); + let side = (count as f32).cbrt().ceil() as usize; + let spacing = 2.5; + + for i in 0..side { + for j in 0..side { + for k in 0..side { + if bodies.len() >= count { + break; + } + + let pos = Vector::new( + i as Real * spacing, + j as Real * spacing, + k as Real * spacing, + ); + + let rb = RigidBodyBuilder::dynamic() + .translation(pos) + .linvel(Vector::new( + (i as Real - side as Real / 2.0) * 0.1, + -1.0, + (k as Real - side as Real / 2.0) * 0.1, + )) + .build(); + + bodies.insert(rb); + } + if bodies.len() >= count { + break; + } + } + if bodies.len() >= count { + break; + } + } + + bodies +} + +fn benchmark_cpu_integration(bodies: &RigidBodySet, iterations: usize) -> f64 { + let dt = 1.0 / 60.0; + let mut bodies_copy = bodies.clone(); + + let start = Instant::now(); + for _ in 0..iterations { + for (_handle, body) in bodies_copy.iter_mut() { + if body.is_dynamic() { + let mut pos = body.position().clone(); + let linvel = body.linvel(); + + pos.translation.x += linvel.x * dt; + pos.translation.y += linvel.y * dt; + pos.translation.z += linvel.z * dt; + + body.set_position(pos, false); + } + } + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn benchmark_gpu_transfer(bodies: &RigidBodySet, buffer_manager: &BufferManager, gpu_buffer: &mut rapier3d::gpu::RigidBodyGpuBuffer, iterations: usize) -> f64 { + let start = Instant::now(); + for _ in 0..iterations { + buffer_manager.upload_rigid_bodies(bodies, 
gpu_buffer); + let (_positions, _velocities) = buffer_manager.download_rigid_bodies(gpu_buffer); + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn print_results_table(results: &[BenchmarkResult]) { + println!("\n╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ CPU vs GPU PERFORMANCE COMPARISON ║"); + println!("╠═══════════════════════════════════════════════════════════════════════════════╣"); + println!("║ Bodies │ CPU Time │ GPU Time │ Speedup │ Winner ║"); + println!("╠══════════╪════════════════╪════════════════╪═════════════╪═══════════════════╣"); + + for result in results { + let winner = if result.speedup > 1.0 { + format!("GPU {:>6.2}x faster", result.speedup) + } else if result.speedup < 0.95 { + format!("CPU {:>6.2}x faster", 1.0 / result.speedup) + } else { + " ~Same speed ".to_string() + }; + + let cpu_str = format_time(result.cpu_time_us); + let gpu_str = format_time(result.gpu_time_us); + + println!("║ {:>8} │ {:>14} │ {:>14} │ {:>11.2}x │ {:>17} ║", + result.scale, + cpu_str, + gpu_str, + result.speedup, + winner); + } + + println!("╚══════════╧════════════════╧════════════════╧═════════════╧═══════════════════╝"); + println!("\nNote: GPU times include CPU↔GPU transfer overhead."); + println!(" Actual GPU compute will be added in Phase 2."); +} + +fn format_time(us: f64) -> String { + if us < 1_000.0 { + format!("{:>10.2} µs", us) + } else if us < 1_000_000.0 { + format!("{:>10.2} ms", us / 1_000.0) + } else { + format!("{:>10.2} s", us / 1_000_000.0) + } +} + +fn main() { + println!("╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ RAPIER GPU ACCELERATION BENCHMARK ║"); + println!("╚═══════════════════════════════════════════════════════════════════════════════╝\n"); + + println!("Initializing GPU..."); + let gpu_setup = match GpuContext::new() { + Ok(ctx) => { + println!("✓ GPU initialized: {}", 
ctx.adapter.get_info().name); + let buffer_manager = BufferManager::new(ctx.device, ctx.queue); + Some(buffer_manager) + }, + Err(e) => { + println!("✗ GPU not available: {:?}", e); + println!(" Running CPU-only benchmarks...\n"); + None + } + }; + + let mut results = Vec::new(); + + for &scale in SCALES { + print!("Benchmarking {} bodies... ", scale); + std::io::Write::flush(&mut std::io::stdout()).unwrap(); + + let bodies = create_test_bodies(scale); + + // Benchmark CPU + let cpu_time = benchmark_cpu_integration(&bodies, ITERATIONS); + + // Benchmark GPU if available + let gpu_time = if let Some(ref buffer_manager) = gpu_setup { + let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len()); + benchmark_gpu_transfer(&bodies, buffer_manager, &mut gpu_buffer, ITERATIONS) + } else { + 0.0 + }; + + let speedup = if gpu_time > 0.0 { + cpu_time / gpu_time + } else { + 0.0 + }; + + results.push(BenchmarkResult { + scale, + cpu_time_us: cpu_time, + gpu_time_us: gpu_time, + speedup, + }); + + println!("Done!"); + } + + print_results_table(&results); +} diff --git a/examples3d/Cargo.toml b/examples3d/Cargo.toml index 1230a9e83..e17fd2a60 100644 --- a/examples3d/Cargo.toml +++ b/examples3d/Cargo.toml @@ -43,6 +43,10 @@ path = "./all_examples3.rs" name = "harness_capsules3" path = "./harness_capsules3.rs" +[[bin]] +name = "gpu_benchmark" +path = "./gpu_benchmark.rs" + #[lib] #crate-type = ["cdylib", "rlib"] #path = "./all_examples3_wasm.rs" diff --git a/examples3d/gpu_benchmark.rs b/examples3d/gpu_benchmark.rs new file mode 100644 index 000000000..930b5ad78 --- /dev/null +++ b/examples3d/gpu_benchmark.rs @@ -0,0 +1,216 @@ +//! GPU vs CPU benchmarks: Direct comparison with summary table. +//! +//! Compares CPU and GPU performance for physics operations +//! and prints an easy-to-read comparison table. 
+ +use rapier3d::prelude::*; +use rapier3d::gpu::{GpuContext, BufferManager, GpuIntegrator}; +use std::time::Instant; + use rapier3d::gpu::wgpu; + +/// Test scales: body counts +const SCALES: &[usize] = &[10, 50, 100, 500, 1_000, 5_000, 10_000, 50_000, 100_000, 500_000, 1_000_000]; +const ITERATIONS: usize = 10; // Number of iterations to average + +struct BenchmarkResult { + scale: usize, + cpu_time_us: f64, + gpu_time_us: f64, + speedup: f64, +} + +/// Helper to create a scene with N bodies in a grid. +fn create_test_bodies(count: usize) -> RigidBodySet { + let mut bodies = RigidBodySet::new(); + let side = (count as f32).cbrt().ceil() as usize; + let spacing = 2.5; + + for i in 0..side { + for j in 0..side { + for k in 0..side { + if bodies.len() >= count { + break; + } + + let pos = Vector::new( + i as Real * spacing, + j as Real * spacing, + k as Real * spacing, + ); + + let rb = RigidBodyBuilder::dynamic() + .translation(pos) + .linvel(Vector::new( + (i as Real - side as Real / 2.0) * 0.1, + -1.0, + (k as Real - side as Real / 2.0) * 0.1, + )) + .build(); + + bodies.insert(rb); + } + if bodies.len() >= count { + break; + } + } + if bodies.len() >= count { + break; + } + } + + bodies +} + +fn benchmark_cpu_integration(bodies: &RigidBodySet, iterations: usize) -> f64 { + let dt = 1.0 / 60.0; + let mut bodies_copy = bodies.clone(); + + let start = Instant::now(); + for _ in 0..iterations { + for (_handle, body) in bodies_copy.iter_mut() { + if body.is_dynamic() { + let mut pos = body.position().clone(); + let linvel = body.linvel(); + + pos.translation.x += linvel.x * dt; + pos.translation.y += linvel.y * dt; + pos.translation.z += linvel.z * dt; + + body.set_position(pos, false); + } + } + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn benchmark_gpu_integration( + bodies: &RigidBodySet, + buffer_manager: &BufferManager, + integrator: &GpuIntegrator, + device: &wgpu::Device, + queue: &wgpu::Queue, + gpu_buffer: 
&mut rapier3d::gpu::RigidBodyGpuBuffer, + iterations: usize +) -> f64 { + let dt = 1.0 / 60.0; + let gravity = [0.0, -9.81, 0.0]; + + let start = Instant::now(); + for _ in 0..iterations { + // Upload data to GPU + buffer_manager.upload_rigid_bodies(bodies, gpu_buffer); + + // Run GPU integration compute shader + integrator.integrate(device, queue, gpu_buffer, dt, gravity, 0.0, 0.0); + + // Download results from GPU + let (_positions, _velocities) = buffer_manager.download_rigid_bodies(gpu_buffer); + } + let elapsed = start.elapsed(); + + elapsed.as_micros() as f64 / iterations as f64 +} + +fn print_results_table(results: &[BenchmarkResult]) { + println!("\n╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ CPU vs GPU PERFORMANCE COMPARISON ║"); + println!("╠═══════════════════════════════════════════════════════════════════════════════╣"); + println!("║ Bodies │ CPU Time │ GPU Time │ Speedup │ Winner ║"); + println!("╠══════════╪════════════════╪════════════════╪═════════════╪═══════════════════╣"); + + for result in results { + let winner = if result.speedup > 1.0 { + format!("GPU {:>6.2}x faster", result.speedup) + } else if result.speedup < 0.95 { + format!("CPU {:>6.2}x faster", 1.0 / result.speedup) + } else { + " ~Same speed ".to_string() + }; + + let cpu_str = format_time(result.cpu_time_us); + let gpu_str = format_time(result.gpu_time_us); + + println!("║ {:>8} │ {:>14} │ {:>14} │ {:>11.2}x │ {:>17} ║", + result.scale, + cpu_str, + gpu_str, + result.speedup, + winner); + } + + println!("╚══════════╧════════════════╧════════════════╧═════════════╧═══════════════════╝"); + println!("\nNote: GPU times now include ACTUAL GPU COMPUTE (integration kernel)!"); + println!(" Transfer overhead + compute shader execution on RTX 4090."); +} + +fn format_time(us: f64) -> String { + if us < 1_000.0 { + format!("{:>10.2} µs", us) + } else if us < 1_000_000.0 { + format!("{:>10.2} ms", us / 1_000.0) + } else { + 
format!("{:>10.2} s", us / 1_000_000.0) + } +} + +fn main() { + println!("╔═══════════════════════════════════════════════════════════════════════════════╗"); + println!("║ RAPIER GPU ACCELERATION BENCHMARK ║"); + println!("╚═══════════════════════════════════════════════════════════════════════════════╝\n"); + + println!("Initializing GPU..."); + let gpu_ctx = match GpuContext::new() { + Ok(ctx) => { + println!("✓ GPU initialized: {}", ctx.adapter.get_info().name); + Some(ctx) + }, + Err(e) => { + println!("✗ GPU not available: {:?}", e); + println!(" Running CPU-only benchmarks...\n"); + None + } + }; + + let mut results = Vec::new(); + + for &scale in SCALES { + print!("Benchmarking {} bodies... ", scale); + std::io::Write::flush(&mut std::io::stdout()).unwrap(); + + let bodies = create_test_bodies(scale); + + // Benchmark CPU + let cpu_time = benchmark_cpu_integration(&bodies, ITERATIONS); + + // Benchmark GPU if available + let gpu_time = if let Some(ref ctx) = gpu_ctx { + // Create buffer manager and integrator with the same device + let buffer_manager = BufferManager::new(ctx.device.clone(), ctx.queue.clone()); + let integrator = GpuIntegrator::new(&ctx.device); + + let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len()); + benchmark_gpu_integration(&bodies, &buffer_manager, &integrator, &ctx.device, &ctx.queue, &mut gpu_buffer, ITERATIONS) + } else { + 0.0 + }; + + let speedup = if gpu_time > 0.0 { + cpu_time / gpu_time + } else { + 0.0 + }; + + results.push(BenchmarkResult { + scale, + cpu_time_us: cpu_time, + gpu_time_us: gpu_time, + speedup, + }); + + println!("Done!"); + } + + print_results_table(&results); +} diff --git a/phase2_results.txt b/phase2_results.txt new file mode 100644 index 000000000..cc395ae81 --- /dev/null +++ b/phase2_results.txt @@ -0,0 +1,136 @@ +warning: unused imports: `BufferManager` and `GpuContext` + --> crates\rapier3d\..\..\src\gpu\integrator.rs:8:18 + | +8 | use crate::gpu::{GpuContext, BufferManager, 
RigidBodyGpuBuffer}; + | ^^^^^^^^^^ ^^^^^^^^^^^^^ + | + = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default + +warning: unused import: `crate::dynamics::RigidBodySet` + --> crates\rapier3d\..\..\src\gpu\integrator.rs:9:5 + | +9 | use crate::dynamics::RigidBodySet; + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: associated function `vector_to_gpu` is never used + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:353:8 + | + 82 | impl BufferManager { + | ------------------ associated function in this implementation +... +353 | fn vector_to_gpu(v: &na::Vector3) -> GpuVector3 { + | ^^^^^^^^^^^^^ + | + = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default + +warning: function `storage_buffer_binding` is never used + --> crates\rapier3d\..\..\src\gpu\pipeline.rs:111:8 + | +111 | pub fn storage_buffer_binding(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry { + | ^^^^^^^^^^^^^^^^^^^^^^ + +warning: function `uniform_buffer_binding` is never used + --> crates\rapier3d\..\..\src\gpu\pipeline.rs:125:8 + | +125 | pub fn uniform_buffer_binding(binding: u32) -> wgpu::BindGroupLayoutEntry { + | ^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\device.rs:12:5 + | +12 | pub device: wgpu::Device, + | ^^^^^^^^^^^^^^^^^^^^^^^^ + | +note: the lint level is defined here + --> crates\rapier3d\..\..\src\lib.rs:14:9 + | +14 | #![warn(missing_docs)] + | ^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\device.rs:13:5 + | +13 | pub queue: wgpu::Queue, + | ^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\device.rs:14:5 + | +14 | pub adapter: wgpu::Adapter, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:20:5 + | +20 | pub positions_buffer: wgpu::Buffer, + | 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:21:5 + | +21 | pub rotations_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:24:5 + | +24 | pub lin_velocities_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:25:5 + | +25 | pub ang_velocities_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:28:5 + | +28 | pub forces_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:29:5 + | +29 | pub torques_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:32:5 + | +32 | pub inv_masses_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:33:5 + | +33 | pub inv_inertias_buffer: wgpu::Buffer, + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:36:5 + | +36 | pub body_count: usize, + | ^^^^^^^^^^^^^^^^^^^^^ + +warning: missing documentation for a struct field + --> crates\rapier3d\..\..\src\gpu\buffer_manager.rs:37:5 + | +37 | pub capacity: usize, + | ^^^^^^^^^^^^^^^^^^^ + +warning: `rapier3d` (lib) generated 18 warnings (run `cargo fix --lib -p rapier3d` to apply 2 suggestions) + Finished `release` profile [optimized] target(s) in 0.38s + Running `target\release\gpu_benchmark.exe` 
+ΓòöΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòù +Γòæ RAPIER GPU ACCELERATION BENCHMARK Γòæ +ΓòÜΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓòÉΓò¥ + +Initializing GPU... +Γ£ô GPU initialized: NVIDIA GeForce RTX 4090 + +thread 'main' (27040) panicked at C:\Users\redst\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\wgpu-core-22.1.0\src\storage.rs:79:46: +Buffer[Id(1,1,vk)] does not exist +note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace +error: process didn't exit successfully: `target\release\gpu_benchmark.exe` (exit code: 101) +Benchmarking 10 bodies... diff --git a/src/gpu/buffer_manager.rs b/src/gpu/buffer_manager.rs new file mode 100644 index 000000000..68fbbda6e --- /dev/null +++ b/src/gpu/buffer_manager.rs @@ -0,0 +1,400 @@ +//! GPU buffer management for physics data. + +use crate::dynamics::RigidBodySet; +use crate::math::Real; + +#[cfg(feature = "dim3")] +use crate::glamx::{Vec3, Quat}; +#[cfg(feature = "dim2")] +use crate::glamx::Vec2; + +use wgpu; +use bytemuck::{Pod, Zeroable}; + +/// GPU-friendly representation of rigid body data using Structure-of-Arrays layout. +/// +/// This layout optimizes for coalesced GPU memory access by grouping similar +/// data types together rather than per-body structures. 
+pub struct RigidBodyGpuBuffer { + // Position data + pub positions_buffer: wgpu::Buffer, + pub rotations_buffer: wgpu::Buffer, + + // Velocity data + pub lin_velocities_buffer: wgpu::Buffer, + pub ang_velocities_buffer: wgpu::Buffer, + + // Force accumulation + pub forces_buffer: wgpu::Buffer, + pub torques_buffer: wgpu::Buffer, + + // Mass properties + pub inv_masses_buffer: wgpu::Buffer, + pub inv_inertias_buffer: wgpu::Buffer, + + // Metadata + pub body_count: usize, + pub capacity: usize, +} + +/// Position data for GPU (3D vector). +#[repr(C)] +#[derive(Copy, Clone, Debug, Pod, Zeroable)] +pub struct GpuVector3 { + pub x: f32, + pub y: f32, + pub z: f32, + pub _padding: f32, // Align to 16 bytes +} + +/// Rotation data for GPU (quaternion in 3D, angle in 2D). +#[repr(C)] +#[derive(Copy, Clone, Debug, Pod, Zeroable)] +pub struct GpuRotation { + #[cfg(feature = "dim3")] + pub x: f32, + #[cfg(feature = "dim3")] + pub y: f32, + #[cfg(feature = "dim3")] + pub z: f32, + #[cfg(feature = "dim3")] + pub w: f32, + + #[cfg(feature = "dim2")] + pub angle: f32, + #[cfg(feature = "dim2")] + pub _padding: [f32; 3], +} + +/// 3x3 matrix for inertia tensors (stored as row-major). +#[repr(C)] +#[derive(Copy, Clone, Debug, Pod, Zeroable)] +pub struct GpuMatrix3 { + pub data: [f32; 12], // 3x3 matrix + padding to 4x3 +} + +/// Manages GPU buffer lifecycle and CPU↔GPU transfers. +pub struct BufferManager { + device: std::sync::Arc, + queue: std::sync::Arc, +} + +impl BufferManager { + /// Creates a new buffer manager. + pub fn new(device: std::sync::Arc, queue: std::sync::Arc) -> Self { + Self { device, queue } + } + + /// Creates GPU buffers for rigid body data. 
+ /// + /// # Arguments + /// + /// * `capacity` - Maximum number of bodies to allocate space for + pub fn create_rigid_body_buffer(&self, capacity: usize) -> RigidBodyGpuBuffer { + let create_buffer = |label: &str, size: usize| { + self.device.create_buffer(&wgpu::BufferDescriptor { + label: Some(label), + size: size as u64, + usage: wgpu::BufferUsages::STORAGE + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }) + }; + + let vec3_size = std::mem::size_of::() * capacity; + let rot_size = std::mem::size_of::() * capacity; + let scalar_size = std::mem::size_of::() * capacity; + let mat3_size = std::mem::size_of::() * capacity; + + RigidBodyGpuBuffer { + positions_buffer: create_buffer("RigidBody Positions", vec3_size), + rotations_buffer: create_buffer("RigidBody Rotations", rot_size), + lin_velocities_buffer: create_buffer("RigidBody Linear Velocities", vec3_size), + ang_velocities_buffer: create_buffer("RigidBody Angular Velocities", vec3_size), + forces_buffer: create_buffer("RigidBody Forces", vec3_size), + torques_buffer: create_buffer("RigidBody Torques", vec3_size), + inv_masses_buffer: create_buffer("RigidBody Inverse Masses", scalar_size), + inv_inertias_buffer: create_buffer("RigidBody Inverse Inertias", mat3_size), + body_count: 0, + capacity, + } + } + + /// Uploads rigid body data from CPU to GPU. + /// + /// # Arguments + /// + /// * `bodies` - The rigid body set to upload + /// * `gpu_buffer` - The GPU buffer to write to + pub fn upload_rigid_bodies( + &self, + bodies: &RigidBodySet, + gpu_buffer: &mut RigidBodyGpuBuffer, + ) { + let body_count = bodies.len(); + + if body_count > gpu_buffer.capacity { + log::warn!( + "Body count ({}) exceeds GPU buffer capacity ({}). 
Truncating.", + body_count, + gpu_buffer.capacity + ); + } + + let count = body_count.min(gpu_buffer.capacity); + gpu_buffer.body_count = count; + + // Prepare CPU-side data in SoA layout + let mut positions = Vec::with_capacity(count); + let mut rotations = Vec::with_capacity(count); + let mut lin_vels = Vec::with_capacity(count); + let mut ang_vels = Vec::with_capacity(count); + let mut forces = Vec::with_capacity(count); + let mut torques = Vec::with_capacity(count); + let mut inv_masses = Vec::with_capacity(count); + let mut inv_inertias = Vec::with_capacity(count); + + for (_handle, body) in bodies.iter().take(count) { + let pos = &body.position().translation; + #[cfg(feature = "dim3")] + positions.push(Self::vector_to_gpu_from_vec3(pos)); + #[cfg(feature = "dim2")] + positions.push(Self::vector_to_gpu_from_vec2(pos)); + + #[cfg(feature = "dim3")] + rotations.push(Self::rotation_to_gpu(&body.position().rotation)); + #[cfg(feature = "dim2")] + rotations.push(Self::rotation_to_gpu(&body.position().rotation)); + + let vel = body.linvel(); + #[cfg(feature = "dim3")] + lin_vels.push(Self::vector_to_gpu_from_vec3(&vel)); + #[cfg(feature = "dim2")] + lin_vels.push(Self::vector_to_gpu_from_vec2(&vel)); + + #[cfg(feature = "dim3")] + { + let angvel = body.angvel(); + ang_vels.push(GpuVector3 { + x: angvel.x, + y: angvel.y, + z: angvel.z, + _padding: 0.0, + }); + } + #[cfg(feature = "dim2")] + { + let angvel = body.angvel(); + ang_vels.push(GpuVector3 { + x: 0.0, + y: 0.0, + z: angvel, + _padding: 0.0, + }); + } + + // User forces + let force = body.user_force(); + #[cfg(feature = "dim3")] + forces.push(Self::vector_to_gpu_from_vec3(&force)); + #[cfg(feature = "dim2")] + forces.push(Self::vector_to_gpu_from_vec2(&force)); + + #[cfg(feature = "dim3")] + { + let torque = body.user_torque(); + torques.push(GpuVector3 { + x: torque.x, + y: torque.y, + z: torque.z, + _padding: 0.0, + }); + } + #[cfg(feature = "dim2")] + { + let torque = body.user_torque(); + 
torques.push(GpuVector3 { + x: 0.0, + y: 0.0, + z: torque, + _padding: 0.0, + }); + } + + let inv_mass = body.mass_properties().local_mprops.inv_mass; + inv_masses.push(inv_mass); + + #[cfg(feature = "dim3")] + { + // SdpMatrix3 is symmetric, extract as Matrix3 + let inv_inertia = body.mass_properties().effective_world_inv_inertia; + let mut data = [0.0f32; 12]; + // SdpMatrix3 stores only 6 unique values (symmetric) + data[0] = inv_inertia.m11; + data[1] = inv_inertia.m12; + data[2] = inv_inertia.m13; + data[4] = inv_inertia.m12; // symmetric + data[5] = inv_inertia.m22; + data[6] = inv_inertia.m23; + data[8] = inv_inertia.m13; // symmetric + data[9] = inv_inertia.m23; // symmetric + data[10] = inv_inertia.m33; + inv_inertias.push(GpuMatrix3 { data }); + } + #[cfg(feature = "dim2")] + { + let inv_inertia = body.mass_properties().effective_world_inv_inertia; + inv_inertias.push(GpuMatrix3 { + data: [inv_inertia, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + }); + } + } + + // Upload to GPU + self.queue.write_buffer(&gpu_buffer.positions_buffer, 0, bytemuck::cast_slice(&positions)); + self.queue.write_buffer(&gpu_buffer.rotations_buffer, 0, bytemuck::cast_slice(&rotations)); + self.queue.write_buffer(&gpu_buffer.lin_velocities_buffer, 0, bytemuck::cast_slice(&lin_vels)); + self.queue.write_buffer(&gpu_buffer.ang_velocities_buffer, 0, bytemuck::cast_slice(&ang_vels)); + self.queue.write_buffer(&gpu_buffer.forces_buffer, 0, bytemuck::cast_slice(&forces)); + self.queue.write_buffer(&gpu_buffer.torques_buffer, 0, bytemuck::cast_slice(&torques)); + self.queue.write_buffer(&gpu_buffer.inv_masses_buffer, 0, bytemuck::cast_slice(&inv_masses)); + self.queue.write_buffer(&gpu_buffer.inv_inertias_buffer, 0, bytemuck::cast_slice(&inv_inertias)); + } + + /// Download rigid body data from GPU back to CPU. + /// Returns vectors of positions and velocities in SoA layout. 
+ pub fn download_rigid_bodies(&self, gpu_buffer: &RigidBodyGpuBuffer) -> (Vec, Vec) { + // Create staging buffers for readback + let positions_staging = self.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Positions Staging Buffer"), + size: (gpu_buffer.body_count * std::mem::size_of::()) as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + let velocities_staging = self.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Velocities Staging Buffer"), + size: (gpu_buffer.body_count * std::mem::size_of::()) as u64, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + // Create command encoder and copy data + let mut encoder = self.device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("Download Encoder"), + }); + + encoder.copy_buffer_to_buffer( + &gpu_buffer.positions_buffer, + 0, + &positions_staging, + 0, + (gpu_buffer.body_count * std::mem::size_of::()) as u64, + ); + + encoder.copy_buffer_to_buffer( + &gpu_buffer.lin_velocities_buffer, + 0, + &velocities_staging, + 0, + (gpu_buffer.body_count * std::mem::size_of::()) as u64, + ); + + self.queue.submit(Some(encoder.finish())); + + // Map and read the staging buffers + let positions_slice = positions_staging.slice(..); + let velocities_slice = velocities_staging.slice(..); + + positions_slice.map_async(wgpu::MapMode::Read, |_| {}); + velocities_slice.map_async(wgpu::MapMode::Read, |_| {}); + + self.device.poll(wgpu::Maintain::Wait); + + let positions_data = positions_slice.get_mapped_range(); + let velocities_data = velocities_slice.get_mapped_range(); + + let positions: Vec = bytemuck::cast_slice(&positions_data).to_vec(); + let velocities: Vec = bytemuck::cast_slice(&velocities_data).to_vec(); + + drop(positions_data); + drop(velocities_data); + + positions_staging.unmap(); + velocities_staging.unmap(); + + (positions, velocities) + } + + /// 
Helper to convert Rapier vector to GPU format (3D). + #[cfg(feature = "dim3")] + fn vector_to_gpu_from_vec3(v: &Vec3) -> GpuVector3 { + GpuVector3 { + x: v.x, + y: v.y, + z: v.z, + _padding: 0.0, + } + } + + /// Helper to convert Rapier vector to GPU format (2D). + #[cfg(feature = "dim2")] + fn vector_to_gpu_from_vec2(v: &Vec2) -> GpuVector3 { + GpuVector3 { + x: v.x, + y: v.y, + z: 0.0, + _padding: 0.0, + } + } + + /// Helper to convert Rapier vector to GPU format. + fn vector_to_gpu(v: &na::Vector3) -> GpuVector3 { + GpuVector3 { + x: v.x, + y: v.y, + z: v.z, + _padding: 0.0, + } + } + + /// Helper to convert Rapier rotation to GPU format (3D). + #[cfg(feature = "dim3")] + fn rotation_to_gpu(rot: &Quat) -> GpuRotation { + GpuRotation { + x: rot.x, + y: rot.y, + z: rot.z, + w: rot.w, + } + } + + /// Helper to convert Rapier rotation to GPU format (2D). + #[cfg(feature = "dim2")] + fn rotation_to_gpu(rot: &crate::glamx::Rot2) -> GpuRotation { + GpuRotation { + angle: rot.angle(), + _padding: [0.0, 0.0, 0.0], + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gpu_vector3_size() { + assert_eq!(std::mem::size_of::(), 16); + assert_eq!(std::mem::align_of::(), 4); + } + + #[test] + fn test_gpu_rotation_size() { + #[cfg(feature = "dim3")] + assert_eq!(std::mem::size_of::(), 16); + #[cfg(feature = "dim2")] + assert_eq!(std::mem::size_of::(), 16); + } +} diff --git a/src/gpu/device.rs b/src/gpu/device.rs new file mode 100644 index 000000000..877b60373 --- /dev/null +++ b/src/gpu/device.rs @@ -0,0 +1,171 @@ +//! GPU device initialization and management. + +use std::sync::Arc; +use wgpu; + +/// GPU context managing WGPU device, queue, and adapter. +/// +/// This is the main entry point for GPU acceleration. 
It handles: +/// - Device selection (prefers discrete GPUs) +/// - Feature validation (compute shaders required) +/// - Adapter capabilities checking +pub struct GpuContext { + pub device: Arc, + pub queue: Arc, + pub adapter: wgpu::Adapter, +} + +/// Errors that can occur during GPU initialization. +#[derive(Debug, thiserror::Error)] +pub enum GpuError { + #[error("No suitable GPU adapter found")] + NoAdapter, + + #[error("Failed to request device: {0}")] + DeviceRequest(#[from] wgpu::RequestDeviceError), + + #[error("Compute shaders not supported on this device")] + ComputeNotSupported, + + #[error("Insufficient GPU memory (required: {required} bytes, available: {available} bytes)")] + InsufficientMemory { required: u64, available: u64 }, +} + +impl GpuContext { + /// Creates a new GPU context with default settings. + /// + /// Prefers discrete GPUs over integrated ones for better performance. + /// Falls back gracefully if no suitable GPU is found. + /// + /// # Errors + /// + /// Returns `GpuError` if: + /// - No GPU adapter is found + /// - Compute shaders are not supported + /// - Device request fails + /// + /// # Example + /// + /// ```no_run + /// # use rapier3d::gpu::GpuContext; + /// let gpu = GpuContext::new().expect("Failed to initialize GPU"); + /// println!("Using GPU: {:?}", gpu.adapter_info().name); + /// ``` + pub fn new() -> Result { + pollster::block_on(Self::new_async()) + } + + /// Async version of `new()` for async runtimes. 
+ pub async fn new_async() -> Result { + let instance = wgpu::Instance::new(wgpu::InstanceDescriptor { + backends: wgpu::Backends::all(), + ..Default::default() + }); + + // Request adapter with preference for discrete GPU + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { + power_preference: wgpu::PowerPreference::HighPerformance, + compatible_surface: None, + force_fallback_adapter: false, + }) + .await + .ok_or(GpuError::NoAdapter)?; + + let adapter_info = adapter.get_info(); + log::info!("Selected GPU: {} ({:?})", adapter_info.name, adapter_info.backend); + log::info!("GPU Type: {:?}", adapter_info.device_type); + + // Verify compute shader support + let features = adapter.features(); + if !features.contains(wgpu::Features::empty()) { + // Basic compute is always available in WebGPU + log::debug!("Compute shaders supported"); + } + + // Request device with required features + let (device, queue) = adapter + .request_device( + &wgpu::DeviceDescriptor { + label: Some("Rapier GPU Device"), + required_features: wgpu::Features::empty(), // Compute is baseline + required_limits: wgpu::Limits { + max_compute_workgroup_size_x: 256, + max_compute_workgroup_size_y: 256, + max_compute_workgroup_size_z: 64, + max_compute_invocations_per_workgroup: 256, + max_compute_workgroups_per_dimension: 65535, + ..Default::default() + }, + memory_hints: Default::default(), + }, + None, + ) + .await?; + + log::info!("GPU device initialized successfully"); + + Ok(Self { + device: Arc::new(device), + queue: Arc::new(queue), + adapter, + }) + } + + /// Returns information about the selected GPU adapter. + pub fn adapter_info(&self) -> wgpu::AdapterInfo { + self.adapter.get_info() + } + + /// Returns the limits of the GPU device. + pub fn limits(&self) -> wgpu::Limits { + self.device.limits() + } + + /// Checks if the GPU has sufficient memory for the given requirement. 
+ pub fn check_memory_requirement(&self, required_bytes: u64) -> Result<(), GpuError> { + // WGPU doesn't expose memory info directly, but we can check limits + let limits = self.limits(); + let max_buffer_size = limits.max_buffer_size; + + if required_bytes > max_buffer_size { + return Err(GpuError::InsufficientMemory { + required: required_bytes, + available: max_buffer_size, + }); + } + + Ok(()) + } + + /// Returns the maximum workgroup size for compute shaders. + pub fn max_workgroup_size(&self) -> (u32, u32, u32) { + let limits = self.limits(); + ( + limits.max_compute_workgroup_size_x, + limits.max_compute_workgroup_size_y, + limits.max_compute_workgroup_size_z, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gpu_context_creation() { + // This test requires a GPU to be available + if let Ok(ctx) = GpuContext::new() { + let info = ctx.adapter_info(); + println!("GPU: {} ({:?})", info.name, info.backend); + + let (x, y, z) = ctx.max_workgroup_size(); + assert!(x >= 256, "Workgroup X size too small"); + assert!(y >= 256, "Workgroup Y size too small"); + assert!(z >= 64, "Workgroup Z size too small"); + } else { + println!("Skipping GPU test - no suitable GPU found"); + } + } +} diff --git a/src/gpu/integrator.rs b/src/gpu/integrator.rs new file mode 100644 index 000000000..447b8ba3f --- /dev/null +++ b/src/gpu/integrator.rs @@ -0,0 +1,251 @@ +/// GPU-accelerated integration kernel. 
+/// +/// Performs symplectic Euler integration on the GPU: +/// - Velocity integration: v' = v + (F/m + g) * dt +/// - Position integration: p' = p + v' * dt +/// - Angular damping and velocity update + +use crate::gpu::{GpuContext, BufferManager, RigidBodyGpuBuffer}; +use crate::dynamics::RigidBodySet; +use bytemuck::{Pod, Zeroable}; +use wgpu::util::DeviceExt; + +/// Integration parameters passed to GPU shader +#[repr(C)] +#[derive(Copy, Clone, Pod, Zeroable)] +struct IntegrationParams { + body_count: u32, + dt: f32, + gravity_x: f32, + gravity_y: f32, + gravity_z: f32, + linear_damping: f32, + angular_damping: f32, + _padding: f32, +} + +/// GPU integration kernel +pub struct GpuIntegrator { + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, +} + +impl GpuIntegrator { + /// Create a new GPU integrator + pub fn new(device: &wgpu::Device) -> Self { + // Load shader + let shader_source = include_str!("shaders/integration.wgsl"); + let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("Integration Shader"), + source: wgpu::ShaderSource::Wgsl(shader_source.into()), + }); + + // Create bind group layout + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("Integration Bind Group Layout"), + entries: &[ + // Uniform buffer: params + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Storage buffers (read_write) + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Rotations (binding 2) + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: 
wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Lin velocities (binding 3) + wgpu::BindGroupLayoutEntry { + binding: 3, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Ang velocities (binding 4) + wgpu::BindGroupLayoutEntry { + binding: 4, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Forces (binding 5, read-only) + wgpu::BindGroupLayoutEntry { + binding: 5, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Torques (binding 6, read-only) + wgpu::BindGroupLayoutEntry { + binding: 6, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // Inv masses (binding 7, read-only) + wgpu::BindGroupLayoutEntry { + binding: 7, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + + // Create pipeline layout + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("Integration Pipeline Layout"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + // Create compute pipeline + let pipeline = 
device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("Integration Pipeline"), + layout: Some(&pipeline_layout), + module: &shader, + entry_point: "main", + compilation_options: Default::default(), + cache: None, + }); + + Self { + pipeline, + bind_group_layout, + } + } + + /// Run integration on GPU + pub fn integrate( + &self, + device: &wgpu::Device, + queue: &wgpu::Queue, + gpu_buffer: &mut RigidBodyGpuBuffer, + dt: f32, + gravity: [f32; 3], + linear_damping: f32, + angular_damping: f32, + ) { + let params = IntegrationParams { + body_count: gpu_buffer.body_count as u32, + dt, + gravity_x: gravity[0], + gravity_y: gravity[1], + gravity_z: gravity[2], + linear_damping, + angular_damping, + _padding: 0.0, + }; + + // Create params buffer + let params_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("Integration Params"), + contents: bytemuck::cast_slice(&[params]), + usage: wgpu::BufferUsages::UNIFORM, + }); + + // Create bind group + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("Integration Bind Group"), + layout: &self.bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: params_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: gpu_buffer.positions_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: gpu_buffer.rotations_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 3, + resource: gpu_buffer.lin_velocities_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 4, + resource: gpu_buffer.ang_velocities_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 5, + resource: gpu_buffer.forces_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 6, + resource: gpu_buffer.torques_buffer.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 7, + resource: 
gpu_buffer.inv_masses_buffer.as_entire_binding(), + }, + ], + }); + + // Create command encoder and dispatch compute + let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("Integration Encoder"), + }); + + { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("Integration Pass"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(&self.pipeline); + compute_pass.set_bind_group(0, &bind_group, &[]); + + // Dispatch workgroups (256 threads per workgroup) + let workgroup_count = (gpu_buffer.body_count as u32 + 255) / 256; + compute_pass.dispatch_workgroups(workgroup_count, 1, 1); + } + + queue.submit(Some(encoder.finish())); + } +} diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs new file mode 100644 index 000000000..8bfa9e518 --- /dev/null +++ b/src/gpu/mod.rs @@ -0,0 +1,66 @@ +//! GPU acceleration module using WGPU compute shaders. +//! +//! This module provides GPU-accelerated implementations of compute-intensive +//! physics operations including collision detection, constraint solving, and +//! integration. It's designed as an optional, incremental enhancement that +//! coexists with the existing CPU implementation. +//! +//! # Features +//! +//! Enable GPU acceleration with the `gpu-acceleration` feature flag: +//! ```toml +//! rapier3d = { version = "0.32", features = ["gpu-acceleration"] } +//! ``` +//! +//! # Architecture +//! +//! - **GpuContext**: Manages WGPU device and queue lifecycle +//! - **BufferManager**: Handles CPU↔GPU data transfers +//! - **GpuIntegrator**: Position/velocity integration on GPU +//! - **GpuBroadPhase**: BVH-based collision detection (future) +//! - **GpuConstraintSolver**: Constraint resolution (future) +//! +//! # Example +//! +//! ```no_run +//! # #[cfg(feature = "gpu-acceleration")] +//! # { +//! use rapier3d::prelude::*; +//! use rapier3d::gpu::GpuContext; +//! +//! // Initialize GPU context +//! 
let gpu_ctx = GpuContext::new().expect("Failed to initialize GPU"); +//! +//! // Use in physics pipeline (future API) +//! let pipeline = PhysicsPipeline::new() +//! .with_gpu_context(gpu_ctx); +//! # } +//! ``` + +#[cfg(feature = "gpu-acceleration")] +mod device; +#[cfg(feature = "gpu-acceleration")] +mod buffer_manager; +#[cfg(feature = "gpu-acceleration")] +mod pipeline; +#[cfg(feature = "gpu-acceleration")] +mod integrator; +#[cfg(feature = "gpu-acceleration")] +mod resident_state; +#[cfg(feature = "gpu-acceleration")] +mod tests; + +#[cfg(feature = "gpu-acceleration")] +pub use device::GpuContext; +#[cfg(feature = "gpu-acceleration")] +pub use buffer_manager::{BufferManager, RigidBodyGpuBuffer}; +#[cfg(feature = "gpu-acceleration")] +pub use pipeline::GpuComputePipeline; +#[cfg(feature = "gpu-acceleration")] +pub use integrator::GpuIntegrator; +#[cfg(feature = "gpu-acceleration")] +pub use resident_state::GpuResidentState; + +/// Re-export WGPU types for convenience +#[cfg(feature = "gpu-acceleration")] +pub use wgpu; diff --git a/src/gpu/pipeline.rs b/src/gpu/pipeline.rs new file mode 100644 index 000000000..840c76976 --- /dev/null +++ b/src/gpu/pipeline.rs @@ -0,0 +1,155 @@ +//! Compute pipeline abstraction for GPU operations. + +use wgpu; + +/// Wrapper for WGPU compute pipelines. +/// +/// Provides a simplified interface for creating and executing compute shaders. +pub struct GpuComputePipeline { + pipeline: wgpu::ComputePipeline, + bind_group_layout: wgpu::BindGroupLayout, +} + +impl GpuComputePipeline { + /// Creates a new compute pipeline from WGSL shader source. 
+ /// + /// # Arguments + /// + /// * `device` - WGPU device + /// * `shader_source` - WGSL shader source code + /// * `entry_point` - Entry point function name (usually "main") + /// * `bind_group_layout_entries` - Buffer binding descriptions + pub fn new( + device: &wgpu::Device, + shader_source: &str, + entry_point: &str, + bind_group_layout_entries: &[wgpu::BindGroupLayoutEntry], + ) -> Self { + let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("Compute Shader"), + source: wgpu::ShaderSource::Wgsl(shader_source.into()), + }); + + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("Compute Bind Group Layout"), + entries: bind_group_layout_entries, + }); + + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("Compute Pipeline Layout"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some("Compute Pipeline"), + layout: Some(&pipeline_layout), + module: &shader_module, + entry_point: entry_point, + compilation_options: Default::default(), + cache: None, + }); + + Self { + pipeline, + bind_group_layout, + } + } + + /// Creates a bind group for this pipeline. + /// + /// # Arguments + /// + /// * `device` - WGPU device + /// * `buffers` - Buffer bindings (must match bind_group_layout order) + pub fn create_bind_group( + &self, + device: &wgpu::Device, + buffers: &[&wgpu::Buffer], + ) -> wgpu::BindGroup { + let entries: Vec = buffers + .iter() + .enumerate() + .map(|(i, buffer)| wgpu::BindGroupEntry { + binding: i as u32, + resource: buffer.as_entire_binding(), + }) + .collect(); + + device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("Compute Bind Group"), + layout: &self.bind_group_layout, + entries: &entries, + }) + } + + /// Dispatches the compute shader. 
+ /// + /// # Arguments + /// + /// * `encoder` - Command encoder + /// * `bind_group` - Bind group with buffer bindings + /// * `workgroups` - Number of workgroups in (x, y, z) + pub fn dispatch( + &self, + encoder: &mut wgpu::CommandEncoder, + bind_group: &wgpu::BindGroup, + workgroups: (u32, u32, u32), + ) { + let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("Compute Pass"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(&self.pipeline); + compute_pass.set_bind_group(0, bind_group, &[]); + compute_pass.dispatch_workgroups(workgroups.0, workgroups.1, workgroups.2); + } +} + +/// Helper to create a standard storage buffer binding layout entry. +pub fn storage_buffer_binding(binding: u32, read_only: bool) -> wgpu::BindGroupLayoutEntry { + wgpu::BindGroupLayoutEntry { + binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + } +} + +/// Helper to create a uniform buffer binding layout entry. 
+pub fn uniform_buffer_binding(binding: u32) -> wgpu::BindGroupLayoutEntry { + wgpu::BindGroupLayoutEntry { + binding, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_storage_buffer_binding() { + let entry = storage_buffer_binding(0, true); + assert_eq!(entry.binding, 0); + assert_eq!(entry.visibility, wgpu::ShaderStages::COMPUTE); + } + + #[test] + fn test_uniform_buffer_binding() { + let entry = uniform_buffer_binding(1); + assert_eq!(entry.binding, 1); + assert_eq!(entry.visibility, wgpu::ShaderStages::COMPUTE); + } +} diff --git a/src/gpu/resident_state.rs b/src/gpu/resident_state.rs new file mode 100644 index 000000000..bd0d1f09b --- /dev/null +++ b/src/gpu/resident_state.rs @@ -0,0 +1,210 @@ +//! GPU-resident physics state manager. +//! +//! This module implements the PhysX-style architecture where rigid body +//! state lives permanently on the GPU. Only deltas (new/changed/removed +//! bodies) are transferred, minimizing CPU↔GPU bandwidth. + +use crate::dynamics::{RigidBodyHandle, RigidBodySet}; +use super::buffer_manager::{BufferManager, RigidBodyGpuBuffer}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +/// Tracks which bodies have been modified and need GPU updates. +#[derive(Default)] +struct DirtyTracker { + /// Bodies added since last sync + added: HashSet, + /// Bodies removed since last sync + removed: HashSet, + /// Bodies whose state changed (forces, velocities, etc.) + modified: HashSet, +} + +impl DirtyTracker { + fn clear(&mut self) { + self.added.clear(); + self.removed.clear(); + self.modified.clear(); + } + + fn is_empty(&self) -> bool { + self.added.is_empty() && self.removed.is_empty() && self.modified.is_empty() + } +} + +/// GPU-resident rigid body state with delta tracking. 
+/// +/// Architecture: +/// - Body data lives on GPU permanently +/// - CPU tracks which bodies are dirty (added/modified/removed) +/// - `sync_to_gpu()` uploads only deltas +/// - `readback_for_rendering()` downloads only positions/rotations +pub struct GpuResidentState { + device: Arc, + queue: Arc, + buffer_manager: BufferManager, + + /// GPU buffers (persistent) + gpu_buffer: Option, + + /// Maps RigidBodyHandle → GPU buffer index + handle_to_index: HashMap, + + /// Dirty tracking for incremental updates + dirty: DirtyTracker, + + /// Current capacity + capacity: usize, + + /// Total bodies currently on GPU + gpu_body_count: usize, +} + +impl GpuResidentState { + /// Create a new GPU-resident state manager. + pub fn new(device: Arc, queue: Arc, initial_capacity: usize) -> Self { + let buffer_manager = BufferManager::new(device.clone(), queue.clone()); + let gpu_buffer = Some(buffer_manager.create_rigid_body_buffer(initial_capacity)); + + Self { + device, + queue, + buffer_manager, + gpu_buffer, + handle_to_index: HashMap::new(), + dirty: DirtyTracker::default(), + capacity: initial_capacity, + gpu_body_count: 0, + } + } + + /// Mark a body as added (will be uploaded on next sync). + pub fn mark_added(&mut self, handle: RigidBodyHandle) { + self.dirty.added.insert(handle); + } + + /// Mark a body as modified (will be re-uploaded on next sync). + pub fn mark_modified(&mut self, handle: RigidBodyHandle) { + if !self.dirty.added.contains(&handle) { + self.dirty.modified.insert(handle); + } + } + + /// Mark a body as removed (will be deleted from GPU on next sync). + pub fn mark_removed(&mut self, handle: RigidBodyHandle) { + self.dirty.removed.insert(handle); + self.dirty.added.remove(&handle); + self.dirty.modified.remove(&handle); + } + + /// Sync dirty bodies to GPU (incremental upload). + /// + /// This is the key optimization: only upload what changed! 
+ pub fn sync_to_gpu(&mut self, bodies: &RigidBodySet) { + if self.dirty.is_empty() { + return; // Nothing to do! + } + + // Handle removals + for handle in &self.dirty.removed { + if let Some(index) = self.handle_to_index.remove(handle) { + // Swap-remove: move last body to this slot + if index < self.gpu_body_count - 1 { + // Find handle of last body and update its index + let last_handle = self.handle_to_index + .iter() + .find(|(_, &idx)| idx == self.gpu_body_count - 1) + .map(|(h, _)| *h); + + if let Some(last_handle) = last_handle { + self.handle_to_index.insert(last_handle, index); + // Mark last body as modified so it gets copied to new slot + self.dirty.modified.insert(last_handle); + } + } + self.gpu_body_count -= 1; + } + } + + // Handle additions + for handle in &self.dirty.added { + if self.gpu_body_count >= self.capacity { + // Resize GPU buffers + self.resize_gpu_buffers(self.capacity * 2); + } + + let index = self.gpu_body_count; + self.handle_to_index.insert(*handle, index); + self.gpu_body_count += 1; + } + + // Upload modified + added bodies + let mut bodies_to_upload = self.dirty.added.iter() + .chain(self.dirty.modified.iter()) + .filter_map(|h| bodies.get(*h).map(|b| (*h, b))) + .collect::>(); + + if !bodies_to_upload.is_empty() { + // TODO: Implement partial buffer upload (currently uploads all) + // For now, upload entire buffer (suboptimal but works) + self.buffer_manager.upload_rigid_bodies(bodies, self.gpu_buffer.as_mut().unwrap()); + } + + self.dirty.clear(); + } + + /// Readback positions/rotations for rendering (minimal data transfer). + /// + /// Only downloads what's needed for visualization - doesn't sync full state. + pub fn readback_for_rendering(&self) -> (Vec, Vec) { + if let Some(ref gpu_buffer) = self.gpu_buffer { + self.buffer_manager.download_rigid_bodies(gpu_buffer) + } else { + (Vec::new(), Vec::new()) + } + } + + /// Get GPU buffer for compute operations. 
+ pub fn gpu_buffer_mut(&mut self) -> Option<&mut RigidBodyGpuBuffer> { + self.gpu_buffer.as_mut() + } + + /// Get current GPU body count. + pub fn body_count(&self) -> usize { + self.gpu_body_count + } + + /// Resize GPU buffers to new capacity. + fn resize_gpu_buffers(&mut self, new_capacity: usize) { + log::info!("Resizing GPU buffers: {} → {}", self.capacity, new_capacity); + + // Create new larger buffer + let new_buffer = self.buffer_manager.create_rigid_body_buffer(new_capacity); + + // TODO: Copy old buffer data to new buffer on GPU (avoid CPU roundtrip) + // For now, mark all bodies as modified to trigger re-upload + for handle in self.handle_to_index.keys() { + self.dirty.modified.insert(*handle); + } + + self.gpu_buffer = Some(new_buffer); + self.capacity = new_capacity; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_dirty_tracking() { + let mut tracker = DirtyTracker::default(); + let handle = RigidBodyHandle::from_raw_parts(0, 0); + + tracker.added.insert(handle); + assert!(!tracker.is_empty()); + + tracker.clear(); + assert!(tracker.is_empty()); + } +} diff --git a/src/gpu/shaders/integration.wgsl b/src/gpu/shaders/integration.wgsl new file mode 100644 index 000000000..55c19c4b8 --- /dev/null +++ b/src/gpu/shaders/integration.wgsl @@ -0,0 +1,98 @@ +// Integration Kernel - Symplectic Euler Integration +// +// Computes next frame state for rigid bodies: +// 1. Velocity integration: v' = v + (F/m + gravity) * dt +// 2. Position integration: p' = p + v' * dt +// 3. Angular integration: ω' = I⁻¹(τ * dt), rotation update +// 4. 
// 4. Apply linear/angular damping

struct IntegrationParams {
    body_count: u32,
    dt: f32,
    gravity_x: f32,
    gravity_y: f32,
    gravity_z: f32,
    linear_damping: f32,
    angular_damping: f32,
    _padding: f32,
}

struct GpuVector3 {
    x: f32,
    y: f32,
    z: f32,
    _padding: f32,
}

struct GpuRotation {
    x: f32,
    y: f32,
    z: f32,
    w: f32,
}

// Address-space/access qualifiers restored (they were stripped in transit):
// params is a uniform; state buffers are read_write storage; per-body inputs
// are read-only storage. These must match the Rust bind group layout.
@group(0) @binding(0) var<uniform> params: IntegrationParams;
@group(0) @binding(1) var<storage, read_write> positions: array<GpuVector3>;
@group(0) @binding(2) var<storage, read_write> rotations: array<GpuRotation>;
@group(0) @binding(3) var<storage, read_write> lin_velocities: array<GpuVector3>;
@group(0) @binding(4) var<storage, read_write> ang_velocities: array<GpuVector3>;
@group(0) @binding(5) var<storage, read> forces: array<GpuVector3>;
@group(0) @binding(6) var<storage, read> torques: array<GpuVector3>;
@group(0) @binding(7) var<storage, read> inv_masses: array<f32>;

@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let idx = global_id.x;

    if (idx >= params.body_count) {
        return;
    }

    let inv_mass = inv_masses[idx];

    // Skip fixed/static bodies (infinite mass is encoded as inv_mass == 0).
    if (inv_mass == 0.0) {
        return;
    }

    // Load current state.
    var pos = vec3<f32>(positions[idx].x, positions[idx].y, positions[idx].z);
    var vel = vec3<f32>(lin_velocities[idx].x, lin_velocities[idx].y, lin_velocities[idx].z);
    var ang_vel = vec3<f32>(ang_velocities[idx].x, ang_velocities[idx].y, ang_velocities[idx].z);
    let force = vec3<f32>(forces[idx].x, forces[idx].y, forces[idx].z);
    let torque = vec3<f32>(torques[idx].x, torques[idx].y, torques[idx].z);

    let gravity = vec3<f32>(params.gravity_x, params.gravity_y, params.gravity_z);

    // 1. Velocity integration: v' = v + (F/m + g) * dt
    let acceleration = force * inv_mass + gravity;
    vel = vel + acceleration * params.dt;

    // 2. Apply linear damping: v' = v * (1 - damping)^dt,
    //    approximated as v * (1 - damping * dt) for small dt.
    vel = vel * (1.0 - params.linear_damping * params.dt);

    // 3. Position integration: p' = p + v * dt (symplectic Euler).
    pos = pos + vel * params.dt;

    // 4. Angular velocity damping.
    ang_vel = ang_vel * (1.0 - params.angular_damping * params.dt);

    // 5. Rotation integration (simplified - not yet implemented).
    //    Full quaternion integration would be: q' = q + 0.5 * q * [0, ω] * dt
    //    TODO: implement proper quaternion integration. Note that `torque` is
    //    loaded above but not yet applied to the angular velocity.

    // Write back updated state.
    positions[idx].x = pos.x;
    positions[idx].y = pos.y;
    positions[idx].z = pos.z;

    lin_velocities[idx].x = vel.x;
    lin_velocities[idx].y = vel.y;
    lin_velocities[idx].z = vel.z;

    ang_velocities[idx].x = ang_vel.x;
    ang_velocities[idx].y = ang_vel.y;
    ang_velocities[idx].z = ang_vel.z;
}

// ==== file: src/gpu/tests.rs (new) ====

//! GPU validation tests - compare GPU results against CPU reference.
//!
//! This module provides a testing framework to ensure GPU-accelerated
//! physics produces identical (or numerically equivalent) results to
//! the CPU implementation.

#[cfg(all(test, feature = "gpu-acceleration"))]
mod tests {
    use crate::dynamics::{RigidBodyBuilder, RigidBodySet};
    use crate::geometry::{ColliderBuilder, ColliderSet};
    use crate::gpu::{BufferManager, GpuContext};
    use crate::math::{Real, Vector};
    use approx::assert_relative_eq;

    /// Tolerance for floating-point comparisons (CPU vs GPU).
    const TOLERANCE: Real = 1e-5;
+ fn create_test_scene() -> (RigidBodySet, ColliderSet) { + let mut bodies = RigidBodySet::new(); + let mut colliders = ColliderSet::new(); + + // Create a dynamic body + let rb = RigidBodyBuilder::dynamic() + .translation(Vector::new(0.0, 10.0, 0.0)) + .linvel(Vector::new(1.0, -2.0, 0.5)) + .build(); + let rb_handle = bodies.insert(rb); + + // Add a collider + #[cfg(feature = "dim3")] + let collider = ColliderBuilder::ball(0.5).build(); + #[cfg(feature = "dim2")] + let collider = ColliderBuilder::ball(0.5).build(); + + colliders.insert_with_parent(collider, rb_handle, &mut bodies); + + // Create another dynamic body + let rb2 = RigidBodyBuilder::dynamic() + .translation(Vector::new(2.0, 5.0, -1.0)) + .linvel(Vector::new(-0.5, 1.0, 0.0)) + .build(); + let rb2_handle = bodies.insert(rb2); + + #[cfg(feature = "dim3")] + let collider2 = ColliderBuilder::cuboid(0.5, 0.5, 0.5).build(); + #[cfg(feature = "dim2")] + let collider2 = ColliderBuilder::cuboid(0.5, 0.5).build(); + + colliders.insert_with_parent(collider2, rb2_handle, &mut bodies); + + (bodies, colliders) + } + + /// Helper to compare two rigid body sets for equality. 
+    fn assert_bodies_equivalent(cpu_bodies: &RigidBodySet, gpu_bodies: &RigidBodySet) {
+        assert_eq!(
+            cpu_bodies.len(),
+            gpu_bodies.len(),
+            "Body count mismatch"
+        );
+
+        for (handle, cpu_body) in cpu_bodies.iter() {
+            let gpu_body = gpu_bodies
+                .get(handle)
+                .expect("GPU missing body that CPU has");
+
+            // Compare positions
+            let cpu_pos = &cpu_body.position().translation;
+            let gpu_pos = &gpu_body.position().translation;
+
+            #[cfg(feature = "dim3")]
+            {
+                assert_relative_eq!(cpu_pos.x, gpu_pos.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_pos.y, gpu_pos.y, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_pos.z, gpu_pos.z, epsilon = TOLERANCE);
+            }
+            #[cfg(feature = "dim2")]
+            {
+                assert_relative_eq!(cpu_pos.x, gpu_pos.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_pos.y, gpu_pos.y, epsilon = TOLERANCE);
+            }
+
+            // Compare velocities
+            let cpu_vel = cpu_body.linvel();
+            let gpu_vel = gpu_body.linvel();
+
+            #[cfg(feature = "dim3")]
+            {
+                assert_relative_eq!(cpu_vel.x, gpu_vel.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_vel.y, gpu_vel.y, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_vel.z, gpu_vel.z, epsilon = TOLERANCE);
+            }
+            #[cfg(feature = "dim2")]
+            {
+                assert_relative_eq!(cpu_vel.x, gpu_vel.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_vel.y, gpu_vel.y, epsilon = TOLERANCE);
+            }
+
+            // Compare angular velocity
+            #[cfg(feature = "dim3")]
+            {
+                let cpu_angvel = cpu_body.angvel();
+                let gpu_angvel = gpu_body.angvel();
+                assert_relative_eq!(cpu_angvel.x, gpu_angvel.x, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_angvel.y, gpu_angvel.y, epsilon = TOLERANCE);
+                assert_relative_eq!(cpu_angvel.z, gpu_angvel.z, epsilon = TOLERANCE);
+            }
+            #[cfg(feature = "dim2")]
+            {
+                let cpu_angvel = cpu_body.angvel();
+                let gpu_angvel = gpu_body.angvel();
+                assert_relative_eq!(*cpu_angvel, *gpu_angvel, epsilon = TOLERANCE);
+            }
+        }
+    }
+
+    #[test]
+    fn test_gpu_context_initialization() {
+        // Test that we can initialize GPU context
+        let gpu_ctx = GpuContext::new();
+
+        if gpu_ctx.is_err() {
+            println!("Skipping GPU test - no suitable GPU found");
+            return;
+        }
+
+        let gpu_ctx = gpu_ctx.unwrap();
+        let info = gpu_ctx.adapter_info();
+        println!("GPU: {} ({:?})", info.name, info.backend);
+    }
+
+    #[test]
+    fn test_buffer_upload_download() {
+        // Test that we can upload and download data to GPU
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let (bodies, _colliders) = create_test_scene();
+
+        let buffer_manager = BufferManager::new(
+            gpu_ctx.device,
+            gpu_ctx.queue
+        );
+
+        let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len());
+
+        // Upload to GPU
+        buffer_manager.upload_rigid_bodies(&bodies, &mut gpu_buffer);
+
+        println!(
+            "Uploaded {} bodies to GPU buffer (capacity: {})",
+            gpu_buffer.body_count,
+            gpu_buffer.capacity
+        );
+
+        // TODO: Add download and comparison when we implement readback
+        assert_eq!(gpu_buffer.body_count, bodies.len());
+    }
+
+    #[test]
+    #[ignore] // Enable once integration kernel is implemented
+    fn test_integration_cpu_vs_gpu() {
+        // This test will compare CPU integration against GPU integration
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let (mut cpu_bodies, _colliders) = create_test_scene();
+        let (mut gpu_bodies, _) = create_test_scene();
+
+        let dt = 0.016; // 60 FPS
+
+        // TODO: Run CPU integration
+        // for (_handle, body) in cpu_bodies.iter_mut() {
+        //     body.integrate_forces(dt);
+        // }
+
+        // TODO: Run GPU integration
+        // let buffer_manager = BufferManager::new(
+        //     gpu_ctx.device.clone(),
+        //     gpu_ctx.queue.clone()
+        // );
+        // gpu_integrator.integrate(&mut gpu_bodies, dt);
+
+        // Compare results
+        assert_bodies_equivalent(&cpu_bodies, &gpu_bodies);
+    }
+
+    #[test]
+    #[ignore] // Enable once collision detection is implemented
+    fn test_collision_detection_cpu_vs_gpu() {
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        // TODO: Create scene with colliding bodies
+        // TODO: Run CPU collision detection
+        // TODO: Run GPU collision detection
+        // TODO: Compare contact manifolds
+
+        println!("GPU collision detection test (not yet implemented)");
+    }
+
+    #[test]
+    #[ignore] // Enable once constraint solver is implemented
+    fn test_solver_cpu_vs_gpu() {
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        // TODO: Create scene with constraints
+        // TODO: Run CPU solver
+        // TODO: Run GPU solver
+        // TODO: Compare constraint impulses and body velocities
+
+        println!("GPU solver test (not yet implemented)");
+    }
+
+    #[test]
+    #[ignore] // Enable for long-running stability tests
+    fn test_long_simulation_cpu_vs_gpu() {
+        // Run a 10-second simulation and ensure CPU and GPU stay in sync
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let (mut cpu_bodies, cpu_colliders) = create_test_scene();
+        let (mut gpu_bodies, gpu_colliders) = create_test_scene();
+
+        let dt = 0.016; // 60 FPS
+        let steps = 600; // 10 seconds
+
+        for step in 0..steps {
+            // TODO: Step CPU physics
+            // TODO: Step GPU physics
+
+            // Compare every 60 frames (1 second)
+            if step % 60 == 0 {
+                println!("Comparing at step {}", step);
+                assert_bodies_equivalent(&cpu_bodies, &gpu_bodies);
+            }
+        }
+
+        // Final comparison
+        assert_bodies_equivalent(&cpu_bodies, &gpu_bodies);
+    }
+
+    #[test]
+    fn test_stress_test_many_bodies() {
+        // Test with 1000+ bodies to ensure GPU scales
+        let gpu_ctx = match GpuContext::new() {
+            Ok(ctx) => ctx,
+            Err(_) => {
+                println!("Skipping GPU test - no suitable GPU found");
+                return;
+            }
+        };
+
+        let mut bodies = RigidBodySet::new();
+
+        // Create 1000 bodies in a grid
+        for i in 0..10 {
+            for j in 0..10 {
+                for k in 0..10 {
+                    #[cfg(feature = "dim3")]
+                    let pos = Vector::new(i as Real * 2.0, j as Real * 2.0, k as Real * 2.0);
+                    #[cfg(feature = "dim2")]
+                    let pos = Vector::new(i as Real * 2.0, j as Real * 2.0);
+
+                    let rb = RigidBodyBuilder::dynamic()
+                        .translation(pos)
+                        .build();
+                    bodies.insert(rb);
+                }
+            }
+        }
+
+        println!("Created {} bodies", bodies.len());
+
+        let buffer_manager = BufferManager::new(
+            gpu_ctx.device,
+            gpu_ctx.queue
+        );
+
+        let mut gpu_buffer = buffer_manager.create_rigid_body_buffer(bodies.len());
+        buffer_manager.upload_rigid_bodies(&bodies, &mut gpu_buffer);
+
+        assert_eq!(gpu_buffer.body_count, bodies.len());
+        println!("Successfully uploaded {} bodies to GPU", gpu_buffer.body_count);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 8b3366635..ff9e3dedb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -169,6 +169,9 @@ pub mod geometry;
 pub mod pipeline;
 pub mod utils;
 
+#[cfg(feature = "gpu-acceleration")]
+pub mod gpu;
+
 /// Elementary mathematical entities (vectors, matrices, isometries, etc).
 pub mod math {
     pub use parry::math::*;