diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c5d80f..4d7a4eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,6 +117,30 @@ jobs: - name: Test (stde) run: cargo test --features stde + - name: Build (bytecode,taylor) + run: cargo build --features bytecode,taylor + + - name: Test (bytecode,taylor) + run: cargo test --features bytecode,taylor + + - name: Build (bytecode,stde) + run: cargo build --features bytecode,stde + + - name: Test (bytecode,stde) + run: cargo test --features bytecode,stde + + - name: Build (taylor,stde) + run: cargo build --features taylor,stde + + - name: Test (taylor,stde) + run: cargo test --features taylor,stde + + - name: Build (bytecode,taylor,stde) + run: cargo build --features bytecode,taylor,stde + + - name: Test (bytecode,taylor,stde) + run: cargo test --features bytecode,taylor,stde + coverage: name: Coverage runs-on: blacksmith-2vcpu-ubuntu-2404 diff --git a/CLEANUP_PLAN.md b/CLEANUP_PLAN.md index 092151c..f62a7a1 100644 --- a/CLEANUP_PLAN.md +++ b/CLEANUP_PLAN.md @@ -2,6 +2,8 @@ **Generated**: 2026-03-13 | **Project**: echidna v0.4.0 | **LOC**: ~23,400 source, ~15,700 tests +> **Status (2026-03-14)**: Phases 0–3 are complete. All safety comments, lint suppression comments, CI gaps, documentation fixes, and test coverage gaps have been addressed. Remaining work is in Phases 4+ (deferred features and aspirational improvements). See ROADMAP.md for the current state. + ## Executive Summary Three specialist agents (Infrastructure, Design, Documentation) reviewed the echidna codebase. The project is in strong shape — zero lint errors, zero dead code, zero debug artifacts, clean architecture with correct dependency direction, and comprehensive test coverage across 37 integration test files. diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..961c759 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,138 @@ +# Roadmap + +**Version**: v0.4.1+ | **Last updated**: 2026-03-14 + +All core roadmap items (R1–R13) are complete. This document captures forward-looking work: cleanup backlog, infrastructure gaps, deferred features, and aspirational improvements. + +For the historical implementation log, see [docs/roadmap.md](docs/roadmap.md). For deferred/rejected rationale, see [docs/adr-deferred-work.md](docs/adr-deferred-work.md). + +--- + +## Phase 0: Foundation ✅ + +Safety and compliance items with no dependencies. **Complete** (v0.4.1). + +| # | Item | Status | +|---|------|--------| +| 0.1 | Add SAFETY comments to 13 unsafe blocks | ✅ Done | +| 0.2 | Add explanatory comments to 8 `#[allow]` suppressions | ✅ Done | + +--- + +## Phase 1: Cleanup ✅ + +Code duplication consolidation. **Complete** — all items were already addressed in v0.4.1 codebase review. + +| # | Item | Status | +|---|------|--------| +| 1.1 | Consolidate `greedy_coloring` → delegate to `greedy_distance1_coloring` | ✅ Already delegates | +| 1.2 | Consolidate `sparse_hessian` → call `sparse_hessian_with_pattern` | ✅ Already a wrapper | +| 1.3 | Extract shared opcode dispatch from `forward`/`forward_into` | ✅ Uses `forward_dispatch` helper | +| 1.4 | Consolidate `column_coloring`/`row_coloring` → generic helper | ✅ Delegates to `intersection_graph_coloring` | +| 1.5 | Extract helper from `GpuTapeData::from_tape`/`from_tape_f64_lossy` | ✅ Shares `build_from_tape` | + +--- + +## Phase 2: Infrastructure ✅ + +CI and workflow gaps. **Complete**. + +| # | Item | Status | +|---|------|--------| +| 2.1 | Add `diffop` feature to CI test and lint jobs | ✅ Done (v0.4.1) | +| 2.2 | Add `parallel` feature to `publish.yml` pre-publish validation | ✅ Done (v0.4.1) | +| 2.3 | Expand MSRV job to test key feature combinations | ✅ Done — tests bytecode, taylor, stde, and all pairwise/triple combos | + +--- + +## Phase 3: Quality ✅ + +Documentation fixes and test coverage gaps. **Complete**. + +### Documentation + +| # | Item | Status | +|---|------|--------| +| 3.1 | Update CONTRIBUTING.md architecture tree | ✅ Done (v0.4.1) | +| 3.2 | Fix algorithms.md opcode count | ✅ Done (v0.4.1) | +| 3.3 | Move nalgebra entry to Done in ADR | ✅ Done (v0.4.1) | +| 3.4 | Update roadmap.md stale `bytecode_tape` paths | ✅ Done (v0.4.1) | + +### Test Coverage + +| # | Item | Status | +|---|------|--------| +| 3.5 | Add `echidna-optim` solver convergence edge-case tests | ✅ Done — near-singular Hessian, 1e6:1 conditioning, saddle avoidance | +| 3.6 | Add `cross_country` full-tape integration test | ✅ Already has 5+ cross-validation tests | +| 3.7 | Add CSE edge-case tests | ✅ Done — deep chains, powi dedup, multi-output preservation | +| 3.8 | Sparse Jacobian reverse-mode auto-selection test | ✅ Done — wide-input map forces reverse path | + +--- + +## Phase 4: Deferred Features + +Valuable features without current demand. Revisit when a concrete use case arises. + +| # | Item | Effort | Revisit when | Source | +|---|------|--------|--------------|--------| +| 4.1 | Indefinite dense STDE (eigendecomposition for indefinite C matrices) | medium | A user needs indefinite C support | [ADR](docs/adr-deferred-work.md) | +| 4.2 | General-K GPU Taylor kernels (beyond K=3) | medium | Need for GPU-accelerated 3rd+ order derivatives | [ADR](docs/adr-deferred-work.md) | +| 4.3 | Chunked GPU Taylor dispatch (exceed 128 MB WebGPU limit) | small | Users hit the buffer limit in practice | [ADR](docs/adr-deferred-work.md) | +| 4.4 | CUDA `laplacian_with_control_gpu_cuda` | small | CUDA users need variance-reduced Laplacian | [ADR](docs/adr-deferred-work.md) | +| 4.5 | `taylor_forward_2nd_batch` in `GpuBackend` trait | small | Multiple backends need to be used generically | [ADR](docs/adr-deferred-work.md) | + +--- + +## Phase 5: Aspirational + +Nice-to-haves with no urgency. Pursue opportunistically or if the relevant area is being actively modified. + +| # | Item | Effort | Impact | +|---|------|--------|--------| +| 5.1 | Decompose `stde.rs` (1409 lines) into sub-modules | medium | medium | +| 5.2 | Add `#![warn(missing_docs)]` and fill gaps | large | medium | +| 5.3 | Bulk-add `#[must_use]` to pure functions (~267 sites) | medium | low | +| 5.4 | Audit `usize` to `u32` casts in GPU paths | small | medium | + +**Caution**: 5.1 risks breaking delicate Taylor jet propagation logic — only pursue if `stde.rs` continues to grow. 5.2 is large (all public items need docs) — consider enabling per-module incrementally. + +--- + +## Blocked + +| Item | Blocker | Action | +|------|---------|--------| +| RUSTSEC-2024-0436 (paste via simba) — unmaintained | Upstream simba must release with paste alternative | Already ignored in `deny.toml`. Monitor simba releases. | + +--- + +## Dependency Bump + +| Item | Current | Latest | Effort | Notes | +|------|---------|--------|--------|-------| +| cudarc | 0.17 | 0.19 | medium | Breaking API changes in 0.18. Defer until GPU backend is actively developed. | + +--- + +## Rejected + +These items were evaluated and explicitly rejected. Rationale is in [docs/adr-deferred-work.md](docs/adr-deferred-work.md). + +- **Constant deduplication** — `FloatBits` orphan rule blocks impl; CSE handles the common case +- **Cross-checkpoint DCE** — contradicts segment isolation design +- **SIMD vectorization** — bottleneck is opcode dispatch, not FP throughput +- **no_std / embedded** — requires ground-up rewrite (heap allocation, thread-local tapes) +- **Source transformation / proc-macro AD** — orthogonal approach, separate project +- **Preaccumulation** — superseded by cross-country Markowitz elimination +- **Trait impl macros for num_traits** — hurts error messages, IDE support, debuggability +- **DiffOperator trait abstraction** — `Estimator` trait already provides needed abstraction + +--- + +## Dependencies Between Phases + +``` +Phase 0–3 (complete) — all done as of 2026-03-14 +Phase 4 (deferred features) — each item independent; all require active use of base feature +Phase 5 (aspirational) — independent nice-to-haves +``` diff --git a/echidna-optim/tests/solvers.rs b/echidna-optim/tests/solvers.rs index b92f368..8405b49 100644 --- a/echidna-optim/tests/solvers.rs +++ b/echidna-optim/tests/solvers.rs @@ -651,3 +651,254 @@ fn tape_objective_counts_evals() { let _ = lbfgs(&mut obj, &x0, &LbfgsConfig::default()); assert!(obj.func_evals() > 0, "should have counted evaluations"); } + +// ============================================================ +// Edge cases: near-singular, highly ill-conditioned, saddle +// ============================================================ + +/// f(x,y) = x^4 + y^2. Hessian at origin has zero eigenvalue in x. +struct NearSingularHessian; + +impl Objective for NearSingularHessian { + fn dim(&self) -> usize { + 2 + } + + fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec) { + let f = x[0].powi(4) + x[1] * x[1]; + let g = vec![4.0 * x[0].powi(3), 2.0 * x[1]]; + (f, g) + } + + fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec, Vec>) { + let (f, g) = self.eval_grad(x); + let h = vec![vec![12.0 * x[0] * x[0], 0.0], vec![0.0, 2.0]]; + (f, g, h) + } + + fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec, Vec) { + let g = vec![4.0 * x[0].powi(3), 2.0 * x[1]]; + let hv = vec![12.0 * x[0] * x[0] * v[0], 2.0 * v[1]]; + (g, hv) + } +} + +/// f(x,y) = 0.5*(1e6*x^2 + y^2). Highly ill-conditioned (1e6:1 ratio). +struct HighlyIllConditioned; + +impl Objective for HighlyIllConditioned { + fn dim(&self) -> usize { + 2 + } + + fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec) { + let f = 0.5 * (1e6 * x[0] * x[0] + x[1] * x[1]); + let g = vec![1e6 * x[0], x[1]]; + (f, g) + } + + fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec, Vec>) { + let (f, g) = self.eval_grad(x); + let h = vec![vec![1e6, 0.0], vec![0.0, 1.0]]; + (f, g, h) + } + + fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec, Vec) { + let g = vec![1e6 * x[0], x[1]]; + let hv = vec![1e6 * v[0], v[1]]; + (g, hv) + } +} + +/// f(x,y) = x^2 - y^2 + y^4. Saddle at origin, minima at y = ±1/√2. +struct SaddleNearby; + +impl Objective for SaddleNearby { + fn dim(&self) -> usize { + 2 + } + + fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec) { + let f = x[0] * x[0] - x[1] * x[1] + x[1].powi(4); + let g = vec![2.0 * x[0], -2.0 * x[1] + 4.0 * x[1].powi(3)]; + (f, g) + } + + fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec, Vec>) { + let (f, g) = self.eval_grad(x); + let h = vec![vec![2.0, 0.0], vec![0.0, -2.0 + 12.0 * x[1] * x[1]]]; + (f, g, h) + } + + fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec, Vec) { + let g = vec![2.0 * x[0], -2.0 * x[1] + 4.0 * x[1].powi(3)]; + let hv = vec![2.0 * v[0], (-2.0 + 12.0 * x[1] * x[1]) * v[1]]; + (g, hv) + } +} + +#[test] +fn newton_near_singular_hessian() { + let mut obj = NearSingularHessian; + let config = NewtonConfig { + convergence: ConvergenceParams { + max_iter: 100, + ..Default::default() + }, + ..Default::default() + }; + let result = newton(&mut obj, &[1.0, 1.0], &config); + + // Should converge to origin despite singular Hessian there + assert!( + matches!( + result.termination, + TerminationReason::GradientNorm | TerminationReason::StepSize + ), + "Newton near-singular: terminated with {:?}", + result.termination + ); + assert!(result.x[0].abs() < 1e-2, "x[0] = {}", result.x[0]); + assert!(result.x[1].abs() < 1e-6, "x[1] = {}", result.x[1]); +} + +#[test] +fn lbfgs_near_singular_hessian() { + let mut obj = NearSingularHessian; + let config = LbfgsConfig { + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[1.0, 1.0], &config); + + assert!( + matches!( + result.termination, + TerminationReason::GradientNorm | TerminationReason::StepSize + ), + "L-BFGS near-singular: terminated with {:?}", + result.termination + ); + assert!(result.x[0].abs() < 1e-2, "x[0] = {}", result.x[0]); + assert!(result.x[1].abs() < 1e-6, "x[1] = {}", result.x[1]); +} + +#[test] +fn newton_highly_ill_conditioned() { + let mut obj = HighlyIllConditioned; + let config = NewtonConfig::default(); + let result = newton(&mut obj, &[1.0, 1.0], &config); + + // Newton with exact Hessian handles even extreme conditioning + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 1); + assert_near_origin(&result, 1e-8); +} + +#[test] +fn lbfgs_highly_ill_conditioned() { + let mut obj = HighlyIllConditioned; + let config = LbfgsConfig { + convergence: ConvergenceParams { + max_iter: 500, + ..Default::default() + }, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[1.0, 1.0], &config); + + assert_eq!( + result.termination, + TerminationReason::GradientNorm, + "L-BFGS 1e6:1 ill-conditioned: terminated with {:?}", + result.termination + ); + assert_near_origin(&result, 1e-3); +} + +#[test] +fn trust_region_highly_ill_conditioned() { + let mut obj = HighlyIllConditioned; + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 500, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &[1.0, 1.0], &config); + + assert_eq!( + result.termination, + TerminationReason::GradientNorm, + "Trust-region 1e6:1 ill-conditioned: terminated with {:?}", + result.termination + ); + assert_near_origin(&result, 1e-3); +} + +#[test] +fn newton_avoids_saddle() { + // Start near the saddle at origin but offset in y. Newton should converge + // to a local minimum at (0, ±1/√2), not the saddle. + let mut obj = SaddleNearby; + let config = NewtonConfig { + convergence: ConvergenceParams { + max_iter: 100, + ..Default::default() + }, + ..Default::default() + }; + let result = newton(&mut obj, &[0.1, 0.5], &config); + + assert!( + matches!( + result.termination, + TerminationReason::GradientNorm | TerminationReason::StepSize + ), + "Newton saddle: terminated with {:?}", + result.termination + ); + // Should be at x=0, |y| ≈ 1/√2 ≈ 0.7071 + assert!(result.x[0].abs() < 1e-4, "x[0] = {}", result.x[0]); + let y_expected = 1.0 / 2.0_f64.sqrt(); + assert!( + (result.x[1].abs() - y_expected).abs() < 1e-4, + "|y| = {}, expected {}", + result.x[1].abs(), + y_expected + ); +} + +#[test] +fn trust_region_avoids_saddle() { + let mut obj = SaddleNearby; + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &[0.1, 0.5], &config); + + assert!( + matches!( + result.termination, + TerminationReason::GradientNorm | TerminationReason::StepSize + ), + "Trust-region saddle: terminated with {:?}", + result.termination + ); + assert!(result.x[0].abs() < 1e-4, "x[0] = {}", result.x[0]); + let y_expected = 1.0 / 2.0_f64.sqrt(); + assert!( + (result.x[1].abs() - y_expected).abs() < 1e-4, + "|y| = {}, expected {}", + result.x[1].abs(), + y_expected + ); +} diff --git a/tests/sparse_jacobian.rs b/tests/sparse_jacobian.rs index 9a31252..c49a2e2 100644 --- a/tests/sparse_jacobian.rs +++ b/tests/sparse_jacobian.rs @@ -196,6 +196,66 @@ fn sparse_hessian_with_pattern_precomputed() { } } +#[test] +fn sparse_jacobian_auto_selects_reverse() { + // 5 inputs -> 2 outputs, both outputs depend on all inputs. + // Column coloring is expensive (all columns share both rows), + // row coloring is cheap (only 2 rows) -> auto-selection picks reverse. + fn wide_input_map(x: &[T]) -> Vec { + vec![ + x[0] + x[1] + x[2] + x[3] + x[4], + x[0] * x[1] * x[2] * x[3] * x[4], + ] + } + + let x = [1.0_f64, 2.0, 0.5, 1.5, 3.0]; + let (mut tape, _) = record_multi(|v| wide_input_map(v), &x); + + // Verify the coloring structure actually favors reverse mode. + tape.forward(&x); + let pattern = tape.detect_jacobian_sparsity(); + let (_, num_col_colors) = echidna::sparse::column_coloring(&pattern); + let (_, num_row_colors) = echidna::sparse::row_coloring(&pattern); + assert!( + num_row_colors < num_col_colors, + "test precondition: need row_colors ({}) < col_colors ({}) to force reverse", + num_row_colors, + num_col_colors + ); + + // Auto-selection should pick reverse mode for this wide-input function. + let (_, pat_auto, jac_auto) = tape.sparse_jacobian(&x); + + // Explicit reverse should give the same result. + let (_, pat_rev, jac_rev) = tape.sparse_jacobian_reverse(&x); + + assert_eq!(pat_auto.nnz(), pat_rev.nnz()); + for k in 0..jac_auto.len() { + assert!( + (jac_auto[k] - jac_rev[k]).abs() < 1e-10, + "auto vs reverse mismatch at k={}: auto={}, rev={}", + k, + jac_auto[k], + jac_rev[k] + ); + } + + // Also verify against dense Jacobian for correctness. + let dense_jac = tape.jacobian(&x); + for (k, (&row, &col)) in pat_auto.rows.iter().zip(pat_auto.cols.iter()).enumerate() { + let r = row as usize; + let c = col as usize; + assert!( + (jac_auto[k] - dense_jac[r][c]).abs() < 1e-10, + "auto vs dense mismatch at ({}, {}): auto={}, dense={}", + r, + c, + jac_auto[k], + dense_jac[r][c] + ); + } +} + #[test] fn api_sparse_jacobian() { let x = vec![1.0_f64, 2.0, 3.0]; diff --git a/tests/tape_optimization.rs b/tests/tape_optimization.rs index 7751a67..02a9ddc 100644 --- a/tests/tape_optimization.rs +++ b/tests/tape_optimization.rs @@ -4,7 +4,7 @@ #![cfg(feature = "bytecode")] use approx::assert_relative_eq; -use echidna::{record, record_multi, BReverse}; +use echidna::{record, record_multi, BReverse, Scalar}; use num_traits::Float; // ── Constant folding ── @@ -642,6 +642,109 @@ fn algebraic_rosenbrock() { } } +// ── CSE edge cases ── + +#[test] +fn cse_deep_chains() { + // Multi-level CSE: a=x*y, b=x*y (CSE), c=a+z, d=b+z (should also CSE). + let (mut tape, val) = record( + |x| { + let a = x[0] * x[1]; + let b = x[0] * x[1]; // same as a + let c = a + x[2]; + let d = b + x[2]; // same as c after first CSE pass + c + d + }, + &[2.0_f64, 3.0, 1.0], + ); + + assert_relative_eq!(val, 14.0, max_relative = 1e-12); // 2*(2*3 + 1) = 14 + let ops_before = tape.num_ops(); + + tape.cse(); + let ops_after = tape.num_ops(); + + assert!( + ops_after < ops_before, + "deep CSE should reduce tape: before={}, after={}", + ops_before, + ops_after + ); + + // Gradient: f = 2*(x*y + z) + // df/dx = 2*y = 6, df/dy = 2*x = 4, df/dz = 2 + let g = tape.gradient(&[2.0, 3.0, 1.0]); + assert_relative_eq!(g[0], 6.0, max_relative = 1e-12); + assert_relative_eq!(g[1], 4.0, max_relative = 1e-12); + assert_relative_eq!(g[2], 2.0, max_relative = 1e-12); +} + +#[test] +fn cse_powi_dedup_and_distinct() { + // x.powi(3) computed twice should be deduplicated. + // x.powi(2) vs x.powi(3) should NOT be merged. + let (mut tape, val) = record( + |x| { + let a = x[0].powi(3); + let b = x[0].powi(3); // same as a — should be deduped + let c = x[0].powi(2); // different exponent — must NOT merge + a + b + c + }, + &[2.0_f64], + ); + + // 2^3 + 2^3 + 2^2 = 8 + 8 + 4 = 20 + assert_relative_eq!(val, 20.0, max_relative = 1e-12); + let ops_before = tape.num_ops(); + + tape.cse(); + let ops_after = tape.num_ops(); + + assert!( + ops_after < ops_before, + "CSE should dedup identical powi: before={}, after={}", + ops_before, + ops_after + ); + + // f(x) = 2*x^3 + x^2, f'(x) = 6*x^2 + 2*x = 6*4 + 4 = 28 + let g = tape.gradient(&[2.0]); + assert_relative_eq!(g[0], 28.0, max_relative = 1e-12); +} + +#[test] +fn cse_preserves_multi_output() { + // Multi-output function with shared subexpressions: CSE must preserve + // correctness for all outputs. + fn shared_sub(x: &[T]) -> Vec { + let common = x[0] * x[1]; // shared + let common2 = x[0] * x[1]; // duplicate of common + vec![common + x[2], common2 * x[2]] + } + + let x = [2.0_f64, 3.0, 4.0]; + let (mut tape, values) = record_multi(|v| shared_sub(v), &x); + + assert_relative_eq!(values[0], 10.0, max_relative = 1e-12); // 2*3 + 4 + assert_relative_eq!(values[1], 24.0, max_relative = 1e-12); // 2*3 * 4 + + let jac_before = tape.jacobian(&x); + + tape.cse(); + + let jac_after = tape.jacobian(&x); + for i in 0..2 { + for j in 0..3 { + assert_relative_eq!( + jac_before[i][j], + jac_after[i][j], + max_relative = 1e-12, + epsilon = 1e-14 + ); + } + } +} + // ── Targeted DCE ── #[test]