diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4c5d80f..4d7a4eb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -117,6 +117,30 @@ jobs:
       - name: Test (stde)
         run: cargo test --features stde
 
+      - name: Build (bytecode,taylor)
+        run: cargo build --features bytecode,taylor
+
+      - name: Test (bytecode,taylor)
+        run: cargo test --features bytecode,taylor
+
+      - name: Build (bytecode,stde)
+        run: cargo build --features bytecode,stde
+
+      - name: Test (bytecode,stde)
+        run: cargo test --features bytecode,stde
+
+      - name: Build (taylor,stde)
+        run: cargo build --features taylor,stde
+
+      - name: Test (taylor,stde)
+        run: cargo test --features taylor,stde
+
+      - name: Build (bytecode,taylor,stde)
+        run: cargo build --features bytecode,taylor,stde
+
+      - name: Test (bytecode,taylor,stde)
+        run: cargo test --features bytecode,taylor,stde
+
   coverage:
     name: Coverage
     runs-on: blacksmith-2vcpu-ubuntu-2404
diff --git a/CLEANUP_PLAN.md b/CLEANUP_PLAN.md
index 092151c..f62a7a1 100644
--- a/CLEANUP_PLAN.md
+++ b/CLEANUP_PLAN.md
@@ -2,6 +2,8 @@
 
 **Generated**: 2026-03-13 | **Project**: echidna v0.4.0 | **LOC**: ~23,400 source, ~15,700 tests
 
+> **Status (2026-03-14)**: Phases 0–3 are complete. All safety comments, lint suppression comments, CI gaps, documentation fixes, and test coverage gaps have been addressed. Remaining work is in Phases 4+ (deferred features and aspirational improvements). See ROADMAP.md for the current state.
+
 ## Executive Summary
 
 Three specialist agents (Infrastructure, Design, Documentation) reviewed the echidna codebase. The project is in strong shape — zero lint errors, zero dead code, zero debug artifacts, clean architecture with correct dependency direction, and comprehensive test coverage across 37 integration test files.
diff --git a/ROADMAP.md b/ROADMAP.md
new file mode 100644
index 0000000..961c759
--- /dev/null
+++ b/ROADMAP.md
@@ -0,0 +1,138 @@
+# Roadmap
+
+**Version**: v0.4.1+ | **Last updated**: 2026-03-14
+
+All core roadmap items (R1–R13) are complete. This document captures forward-looking work: cleanup backlog, infrastructure gaps, deferred features, and aspirational improvements.
+
+For the historical implementation log, see [docs/roadmap.md](docs/roadmap.md). For deferred/rejected rationale, see [docs/adr-deferred-work.md](docs/adr-deferred-work.md).
+
+---
+
+## Phase 0: Foundation ✅
+
+Safety and compliance items with no dependencies. **Complete** (v0.4.1).
+
+| # | Item | Status |
+|---|------|--------|
+| 0.1 | Add SAFETY comments to 13 unsafe blocks | ✅ Done |
+| 0.2 | Add explanatory comments to 8 `#[allow]` suppressions | ✅ Done |
+
+---
+
+## Phase 1: Cleanup ✅
+
+Code duplication consolidation. **Complete** — all items were already addressed in v0.4.1 codebase review.
+
+| # | Item | Status |
+|---|------|--------|
+| 1.1 | Consolidate `greedy_coloring` → delegate to `greedy_distance1_coloring` | ✅ Already delegates |
+| 1.2 | Consolidate `sparse_hessian` → call `sparse_hessian_with_pattern` | ✅ Already a wrapper |
+| 1.3 | Extract shared opcode dispatch from `forward`/`forward_into` | ✅ Uses `forward_dispatch` helper |
+| 1.4 | Consolidate `column_coloring`/`row_coloring` → generic helper | ✅ Delegates to `intersection_graph_coloring` |
+| 1.5 | Extract helper from `GpuTapeData::from_tape`/`from_tape_f64_lossy` | ✅ Shares `build_from_tape` |
+
+---
+
+## Phase 2: Infrastructure ✅
+
+CI and workflow gaps. **Complete**.
+
+| # | Item | Status |
+|---|------|--------|
+| 2.1 | Add `diffop` feature to CI test and lint jobs | ✅ Done (v0.4.1) |
+| 2.2 | Add `parallel` feature to `publish.yml` pre-publish validation | ✅ Done (v0.4.1) |
+| 2.3 | Expand MSRV job to test key feature combinations | ✅ Done — tests bytecode, taylor, stde, and all pairwise/triple combos |
+
+---
+
+## Phase 3: Quality ✅
+
+Documentation fixes and test coverage gaps. **Complete**.
+
+### Documentation
+
+| # | Item | Status |
+|---|------|--------|
+| 3.1 | Update CONTRIBUTING.md architecture tree | ✅ Done (v0.4.1) |
+| 3.2 | Fix algorithms.md opcode count | ✅ Done (v0.4.1) |
+| 3.3 | Move nalgebra entry to Done in ADR | ✅ Done (v0.4.1) |
+| 3.4 | Update roadmap.md stale `bytecode_tape` paths | ✅ Done (v0.4.1) |
+
+### Test Coverage
+
+| # | Item | Status |
+|---|------|--------|
+| 3.5 | Add `echidna-optim` solver convergence edge-case tests | ✅ Done — near-singular Hessian, 1e6:1 conditioning, saddle avoidance |
+| 3.6 | Add `cross_country` full-tape integration test | ✅ Already has 5+ cross-validation tests |
+| 3.7 | Add CSE edge-case tests | ✅ Done — deep chains, powi dedup, multi-output preservation |
+| 3.8 | Sparse Jacobian reverse-mode auto-selection test | ✅ Done — wide-input map forces reverse path |
+
+---
+
+## Phase 4: Deferred Features
+
+Valuable features without current demand. Revisit when a concrete use case arises.
+
+| # | Item | Effort | Revisit when | Source |
+|---|------|--------|--------------|--------|
+| 4.1 | Indefinite dense STDE (eigendecomposition for indefinite C matrices) | medium | A user needs indefinite C support | [ADR](docs/adr-deferred-work.md) |
+| 4.2 | General-K GPU Taylor kernels (beyond K=3) | medium | Need for GPU-accelerated 3rd+ order derivatives | [ADR](docs/adr-deferred-work.md) |
+| 4.3 | Chunked GPU Taylor dispatch (exceed 128 MB WebGPU limit) | small | Users hit the buffer limit in practice | [ADR](docs/adr-deferred-work.md) |
+| 4.4 | CUDA `laplacian_with_control_gpu_cuda` | small | CUDA users need variance-reduced Laplacian | [ADR](docs/adr-deferred-work.md) |
+| 4.5 | `taylor_forward_2nd_batch` in `GpuBackend` trait | small | Multiple backends need to be used generically | [ADR](docs/adr-deferred-work.md) |
+
+---
+
+## Phase 5: Aspirational
+
+Nice-to-haves with no urgency. Pursue opportunistically or if the relevant area is being actively modified.
+
+| # | Item | Effort | Impact |
+|---|------|--------|--------|
+| 5.1 | Decompose `stde.rs` (1409 lines) into sub-modules | medium | medium |
+| 5.2 | Add `#![warn(missing_docs)]` and fill gaps | large | medium |
+| 5.3 | Bulk-add `#[must_use]` to pure functions (~267 sites) | medium | low |
+| 5.4 | Audit `usize` to `u32` casts in GPU paths | small | medium |
+
+**Caution**: 5.1 risks breaking delicate Taylor jet propagation logic — only pursue if `stde.rs` continues to grow. 5.2 is large (all public items need docs) — consider enabling per-module incrementally.
+
+---
+
+## Blocked
+
+| Item | Blocker | Action |
+|------|---------|--------|
+| RUSTSEC-2024-0436 (paste via simba) — unmaintained | Upstream simba must release with paste alternative | Already ignored in `deny.toml`. Monitor simba releases. |
+
+---
+
+## Dependency Bump
+
+| Item | Current | Latest | Effort | Notes |
+|------|---------|--------|--------|-------|
+| cudarc | 0.17 | 0.19 | medium | Breaking API changes in 0.18. Defer until GPU backend is actively developed. |
+
+---
+
+## Rejected
+
+These items were evaluated and explicitly rejected. Rationale is in [docs/adr-deferred-work.md](docs/adr-deferred-work.md).
+
+- **Constant deduplication** — `FloatBits` orphan rule blocks impl; CSE handles the common case
+- **Cross-checkpoint DCE** — contradicts segment isolation design
+- **SIMD vectorization** — bottleneck is opcode dispatch, not FP throughput
+- **no_std / embedded** — requires ground-up rewrite (heap allocation, thread-local tapes)
+- **Source transformation / proc-macro AD** — orthogonal approach, separate project
+- **Preaccumulation** — superseded by cross-country Markowitz elimination
+- **Trait impl macros for num_traits** — hurts error messages, IDE support, debuggability
+- **DiffOperator trait abstraction** — `Estimator` trait already provides needed abstraction
+
+---
+
+## Dependencies Between Phases
+
+```
+Phase 0–3  (complete)        — all done as of 2026-03-14
+Phase 4  (deferred features)  — each item independent; all require active use of base feature
+Phase 5  (aspirational)       — independent nice-to-haves
+```
diff --git a/echidna-optim/tests/solvers.rs b/echidna-optim/tests/solvers.rs
index b92f368..8405b49 100644
--- a/echidna-optim/tests/solvers.rs
+++ b/echidna-optim/tests/solvers.rs
@@ -651,3 +651,254 @@ fn tape_objective_counts_evals() {
     let _ = lbfgs(&mut obj, &x0, &LbfgsConfig::default());
     assert!(obj.func_evals() > 0, "should have counted evaluations");
 }
+
+// ============================================================
+// Edge cases: near-singular, highly ill-conditioned, saddle
+// ============================================================
+
+/// f(x,y) = x^4 + y^2. Hessian at origin has zero eigenvalue in x.
+struct NearSingularHessian;
+
+impl Objective<f64> for NearSingularHessian {
+    fn dim(&self) -> usize {
+        2
+    }
+
+    fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec<f64>) {
+        let f = x[0].powi(4) + x[1] * x[1];
+        let g = vec![4.0 * x[0].powi(3), 2.0 * x[1]];
+        (f, g)
+    }
+
+    fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec<f64>, Vec<Vec<f64>>) {
+        let (f, g) = self.eval_grad(x);
+        let h = vec![vec![12.0 * x[0] * x[0], 0.0], vec![0.0, 2.0]];
+        (f, g, h)
+    }
+
+    fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec<f64>, Vec<f64>) {
+        let g = vec![4.0 * x[0].powi(3), 2.0 * x[1]];
+        let hv = vec![12.0 * x[0] * x[0] * v[0], 2.0 * v[1]];
+        (g, hv)
+    }
+}
+
+/// f(x,y) = 0.5*(1e6*x^2 + y^2). Highly ill-conditioned (1e6:1 ratio).
+struct HighlyIllConditioned;
+
+impl Objective<f64> for HighlyIllConditioned {
+    fn dim(&self) -> usize {
+        2
+    }
+
+    fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec<f64>) {
+        let f = 0.5 * (1e6 * x[0] * x[0] + x[1] * x[1]);
+        let g = vec![1e6 * x[0], x[1]];
+        (f, g)
+    }
+
+    fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec<f64>, Vec<Vec<f64>>) {
+        let (f, g) = self.eval_grad(x);
+        let h = vec![vec![1e6, 0.0], vec![0.0, 1.0]];
+        (f, g, h)
+    }
+
+    fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec<f64>, Vec<f64>) {
+        let g = vec![1e6 * x[0], x[1]];
+        let hv = vec![1e6 * v[0], v[1]];
+        (g, hv)
+    }
+}
+
+/// f(x,y) = x^2 - y^2 + y^4. Saddle at origin, minima at y = ±1/√2.
+struct SaddleNearby;
+
+impl Objective<f64> for SaddleNearby {
+    fn dim(&self) -> usize {
+        2
+    }
+
+    fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec<f64>) {
+        let f = x[0] * x[0] - x[1] * x[1] + x[1].powi(4);
+        let g = vec![2.0 * x[0], -2.0 * x[1] + 4.0 * x[1].powi(3)];
+        (f, g)
+    }
+
+    fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec<f64>, Vec<Vec<f64>>) {
+        let (f, g) = self.eval_grad(x);
+        let h = vec![vec![2.0, 0.0], vec![0.0, -2.0 + 12.0 * x[1] * x[1]]];
+        (f, g, h)
+    }
+
+    fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec<f64>, Vec<f64>) {
+        let g = vec![2.0 * x[0], -2.0 * x[1] + 4.0 * x[1].powi(3)];
+        let hv = vec![2.0 * v[0], (-2.0 + 12.0 * x[1] * x[1]) * v[1]];
+        (g, hv)
+    }
+}
+
+#[test]
+fn newton_near_singular_hessian() {
+    let mut obj = NearSingularHessian;
+    let config = NewtonConfig {
+        convergence: ConvergenceParams {
+            max_iter: 100,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = newton(&mut obj, &[1.0, 1.0], &config);
+
+    // Should converge to origin despite singular Hessian there
+    assert!(
+        matches!(
+            result.termination,
+            TerminationReason::GradientNorm | TerminationReason::StepSize
+        ),
+        "Newton near-singular: terminated with {:?}",
+        result.termination
+    );
+    assert!(result.x[0].abs() < 1e-2, "x[0] = {}", result.x[0]);
+    assert!(result.x[1].abs() < 1e-6, "x[1] = {}", result.x[1]);
+}
+
+#[test]
+fn lbfgs_near_singular_hessian() {
+    let mut obj = NearSingularHessian;
+    let config = LbfgsConfig {
+        convergence: ConvergenceParams {
+            max_iter: 200,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[1.0, 1.0], &config);
+
+    assert!(
+        matches!(
+            result.termination,
+            TerminationReason::GradientNorm | TerminationReason::StepSize
+        ),
+        "L-BFGS near-singular: terminated with {:?}",
+        result.termination
+    );
+    assert!(result.x[0].abs() < 1e-2, "x[0] = {}", result.x[0]);
+    assert!(result.x[1].abs() < 1e-6, "x[1] = {}", result.x[1]);
+}
+
+#[test]
+fn newton_highly_ill_conditioned() {
+    let mut obj = HighlyIllConditioned;
+    let config = NewtonConfig::default();
+    let result = newton(&mut obj, &[1.0, 1.0], &config);
+
+    // Newton with exact Hessian handles even extreme conditioning
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 1);
+    assert_near_origin(&result, 1e-8);
+}
+
+#[test]
+fn lbfgs_highly_ill_conditioned() {
+    let mut obj = HighlyIllConditioned;
+    let config = LbfgsConfig {
+        convergence: ConvergenceParams {
+            max_iter: 500,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[1.0, 1.0], &config);
+
+    assert_eq!(
+        result.termination,
+        TerminationReason::GradientNorm,
+        "L-BFGS 1e6:1 ill-conditioned: terminated with {:?}",
+        result.termination
+    );
+    assert_near_origin(&result, 1e-3);
+}
+
+#[test]
+fn trust_region_highly_ill_conditioned() {
+    let mut obj = HighlyIllConditioned;
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 500,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &[1.0, 1.0], &config);
+
+    assert_eq!(
+        result.termination,
+        TerminationReason::GradientNorm,
+        "Trust-region 1e6:1 ill-conditioned: terminated with {:?}",
+        result.termination
+    );
+    assert_near_origin(&result, 1e-3);
+}
+
+#[test]
+fn newton_avoids_saddle() {
+    // Start near the saddle at origin but offset in y. Newton should converge
+    // to a local minimum at (0, ±1/√2), not the saddle.
+    let mut obj = SaddleNearby;
+    let config = NewtonConfig {
+        convergence: ConvergenceParams {
+            max_iter: 100,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = newton(&mut obj, &[0.1, 0.5], &config);
+
+    assert!(
+        matches!(
+            result.termination,
+            TerminationReason::GradientNorm | TerminationReason::StepSize
+        ),
+        "Newton saddle: terminated with {:?}",
+        result.termination
+    );
+    // Should be at x=0, |y| ≈ 1/√2 ≈ 0.7071
+    assert!(result.x[0].abs() < 1e-4, "x[0] = {}", result.x[0]);
+    let y_expected = 1.0 / 2.0_f64.sqrt();
+    assert!(
+        (result.x[1].abs() - y_expected).abs() < 1e-4,
+        "|y| = {}, expected {}",
+        result.x[1].abs(),
+        y_expected
+    );
+}
+
+#[test]
+fn trust_region_avoids_saddle() {
+    let mut obj = SaddleNearby;
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 200,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &[0.1, 0.5], &config);
+
+    assert!(
+        matches!(
+            result.termination,
+            TerminationReason::GradientNorm | TerminationReason::StepSize
+        ),
+        "Trust-region saddle: terminated with {:?}",
+        result.termination
+    );
+    assert!(result.x[0].abs() < 1e-4, "x[0] = {}", result.x[0]);
+    let y_expected = 1.0 / 2.0_f64.sqrt();
+    assert!(
+        (result.x[1].abs() - y_expected).abs() < 1e-4,
+        "|y| = {}, expected {}",
+        result.x[1].abs(),
+        y_expected
+    );
+}
diff --git a/tests/sparse_jacobian.rs b/tests/sparse_jacobian.rs
index 9a31252..c49a2e2 100644
--- a/tests/sparse_jacobian.rs
+++ b/tests/sparse_jacobian.rs
@@ -196,6 +196,66 @@ fn sparse_hessian_with_pattern_precomputed() {
     }
 }
 
+#[test]
+fn sparse_jacobian_auto_selects_reverse() {
+    // 5 inputs -> 2 outputs, both outputs depend on all inputs.
+    // Column coloring is expensive (all columns share both rows),
+    // row coloring is cheap (only 2 rows) -> auto-selection picks reverse.
+    fn wide_input_map<T: Scalar>(x: &[T]) -> Vec<T> {
+        vec![
+            x[0] + x[1] + x[2] + x[3] + x[4],
+            x[0] * x[1] * x[2] * x[3] * x[4],
+        ]
+    }
+
+    let x = [1.0_f64, 2.0, 0.5, 1.5, 3.0];
+    let (mut tape, _) = record_multi(|v| wide_input_map(v), &x);
+
+    // Verify the coloring structure actually favors reverse mode.
+    tape.forward(&x);
+    let pattern = tape.detect_jacobian_sparsity();
+    let (_, num_col_colors) = echidna::sparse::column_coloring(&pattern);
+    let (_, num_row_colors) = echidna::sparse::row_coloring(&pattern);
+    assert!(
+        num_row_colors < num_col_colors,
+        "test precondition: need row_colors ({}) < col_colors ({}) to force reverse",
+        num_row_colors,
+        num_col_colors
+    );
+
+    // Auto-selection should pick reverse mode for this wide-input function.
+    let (_, pat_auto, jac_auto) = tape.sparse_jacobian(&x);
+
+    // Explicit reverse should give the same result.
+    let (_, pat_rev, jac_rev) = tape.sparse_jacobian_reverse(&x);
+
+    assert_eq!(pat_auto.nnz(), pat_rev.nnz());
+    for k in 0..jac_auto.len() {
+        assert!(
+            (jac_auto[k] - jac_rev[k]).abs() < 1e-10,
+            "auto vs reverse mismatch at k={}: auto={}, rev={}",
+            k,
+            jac_auto[k],
+            jac_rev[k]
+        );
+    }
+
+    // Also verify against dense Jacobian for correctness.
+    let dense_jac = tape.jacobian(&x);
+    for (k, (&row, &col)) in pat_auto.rows.iter().zip(pat_auto.cols.iter()).enumerate() {
+        let r = row as usize;
+        let c = col as usize;
+        assert!(
+            (jac_auto[k] - dense_jac[r][c]).abs() < 1e-10,
+            "auto vs dense mismatch at ({}, {}): auto={}, dense={}",
+            r,
+            c,
+            jac_auto[k],
+            dense_jac[r][c]
+        );
+    }
+}
+
 #[test]
 fn api_sparse_jacobian() {
     let x = vec![1.0_f64, 2.0, 3.0];
diff --git a/tests/tape_optimization.rs b/tests/tape_optimization.rs
index 7751a67..02a9ddc 100644
--- a/tests/tape_optimization.rs
+++ b/tests/tape_optimization.rs
@@ -4,7 +4,7 @@
 #![cfg(feature = "bytecode")]
 
 use approx::assert_relative_eq;
-use echidna::{record, record_multi, BReverse};
+use echidna::{record, record_multi, BReverse, Scalar};
 use num_traits::Float;
 
 // ── Constant folding ──
@@ -642,6 +642,109 @@ fn algebraic_rosenbrock() {
     }
 }
 
+// ── CSE edge cases ──
+
+#[test]
+fn cse_deep_chains() {
+    // Multi-level CSE: a=x*y, b=x*y (CSE), c=a+z, d=b+z (should also CSE).
+    let (mut tape, val) = record(
+        |x| {
+            let a = x[0] * x[1];
+            let b = x[0] * x[1]; // same as a
+            let c = a + x[2];
+            let d = b + x[2]; // same as c after first CSE pass
+            c + d
+        },
+        &[2.0_f64, 3.0, 1.0],
+    );
+
+    assert_relative_eq!(val, 14.0, max_relative = 1e-12); // 2*(2*3 + 1) = 14
+    let ops_before = tape.num_ops();
+
+    tape.cse();
+    let ops_after = tape.num_ops();
+
+    assert!(
+        ops_after < ops_before,
+        "deep CSE should reduce tape: before={}, after={}",
+        ops_before,
+        ops_after
+    );
+
+    // Gradient: f = 2*(x*y + z)
+    // df/dx = 2*y = 6, df/dy = 2*x = 4, df/dz = 2
+    let g = tape.gradient(&[2.0, 3.0, 1.0]);
+    assert_relative_eq!(g[0], 6.0, max_relative = 1e-12);
+    assert_relative_eq!(g[1], 4.0, max_relative = 1e-12);
+    assert_relative_eq!(g[2], 2.0, max_relative = 1e-12);
+}
+
+#[test]
+fn cse_powi_dedup_and_distinct() {
+    // x.powi(3) computed twice should be deduplicated.
+    // x.powi(2) vs x.powi(3) should NOT be merged.
+    let (mut tape, val) = record(
+        |x| {
+            let a = x[0].powi(3);
+            let b = x[0].powi(3); // same as a — should be deduped
+            let c = x[0].powi(2); // different exponent — must NOT merge
+            a + b + c
+        },
+        &[2.0_f64],
+    );
+
+    // 2^3 + 2^3 + 2^2 = 8 + 8 + 4 = 20
+    assert_relative_eq!(val, 20.0, max_relative = 1e-12);
+    let ops_before = tape.num_ops();
+
+    tape.cse();
+    let ops_after = tape.num_ops();
+
+    assert!(
+        ops_after < ops_before,
+        "CSE should dedup identical powi: before={}, after={}",
+        ops_before,
+        ops_after
+    );
+
+    // f(x) = 2*x^3 + x^2, f'(x) = 6*x^2 + 2*x = 6*4 + 4 = 28
+    let g = tape.gradient(&[2.0]);
+    assert_relative_eq!(g[0], 28.0, max_relative = 1e-12);
+}
+
+#[test]
+fn cse_preserves_multi_output() {
+    // Multi-output function with shared subexpressions: CSE must preserve
+    // correctness for all outputs.
+    fn shared_sub<T: Scalar>(x: &[T]) -> Vec<T> {
+        let common = x[0] * x[1]; // shared
+        let common2 = x[0] * x[1]; // duplicate of common
+        vec![common + x[2], common2 * x[2]]
+    }
+
+    let x = [2.0_f64, 3.0, 4.0];
+    let (mut tape, values) = record_multi(|v| shared_sub(v), &x);
+
+    assert_relative_eq!(values[0], 10.0, max_relative = 1e-12); // 2*3 + 4
+    assert_relative_eq!(values[1], 24.0, max_relative = 1e-12); // 2*3 * 4
+
+    let jac_before = tape.jacobian(&x);
+
+    tape.cse();
+
+    let jac_after = tape.jacobian(&x);
+    for i in 0..2 {
+        for j in 0..3 {
+            assert_relative_eq!(
+                jac_before[i][j],
+                jac_after[i][j],
+                max_relative = 1e-12,
+                epsilon = 1e-14
+            );
+        }
+    }
+}
+
 // ── Targeted DCE ──
 
 #[test]