From 6d2bd60249e444327e7715cb58918b9764a80386 Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Sat, 14 Mar 2026 16:40:17 +0000 Subject: [PATCH 1/2] Codebase cleanup phases 0-4 Phase 0-1: Add SAFETY comments to intentional unsafe patterns, replace suppression comments with structured justifications. Phase 2-3: Harden CI (pin actions by SHA, add Clippy/rustfmt checks), update documentation and consolidate duplicated code in sparse Jacobian reverse-mode and bytecode forward pass. Phase 4: Decompose stde.rs (1409 lines) into 9-file directory module, bulk-add #[must_use] to 65 pure functions, and document GPU u32 cast assumptions. --- .github/workflows/ci.yml | 26 +- .github/workflows/publish.yml | 8 +- CLEANUP_PLAN.md | 197 +++++ CONTRIBUTING.md | 13 +- docs/adr-deferred-work.md | 9 +- docs/algorithms.md | 2 +- docs/roadmap.md | 14 +- src/bytecode_tape/forward.rs | 113 ++- src/bytecode_tape/mod.rs | 14 + src/bytecode_tape/reverse.rs | 1 + src/bytecode_tape/sparse.rs | 47 +- src/checkpoint.rs | 6 + src/diffop.rs | 27 + src/gpu/cuda_backend.rs | 20 + src/gpu/mod.rs | 67 +- src/nonsmooth.rs | 1 + src/opcode.rs | 3 + src/sparse.rs | 156 ++-- src/stde.rs | 1409 --------------------------------- src/stde/estimator.rs | 45 ++ src/stde/higher_order.rs | 303 +++++++ src/stde/jet.rs | 76 ++ src/stde/laplacian.rs | 342 ++++++++ src/stde/mod.rs | 117 +++ src/stde/pde.rs | 256 ++++++ src/stde/pipeline.rs | 110 +++ src/stde/sparse.rs | 80 ++ src/stde/types.rs | 75 ++ src/tape.rs | 3 + src/taylor_dyn.rs | 9 + src/traits/laurent_std_ops.rs | 12 +- src/traits/simba_impls.rs | 8 + src/traits/taylor_std_ops.rs | 19 +- 33 files changed, 1906 insertions(+), 1682 deletions(-) create mode 100644 CLEANUP_PLAN.md delete mode 100644 src/stde.rs create mode 100644 src/stde/estimator.rs create mode 100644 src/stde/higher_order.rs create mode 100644 src/stde/jet.rs create mode 100644 src/stde/laplacian.rs create mode 100644 src/stde/mod.rs create mode 100644 src/stde/pde.rs create mode 100644 src/stde/pipeline.rs create mode 100644 src/stde/sparse.rs create mode 100644 src/stde/types.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 210c230..4c5d80f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,10 +24,10 @@ jobs: uses: Swatinem/rust-cache@v2 - name: Build - run: cargo build --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel" + run: cargo build --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel,diffop" - name: Run tests (echidna with all features) - run: cargo test -p echidna --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel" + run: cargo test -p echidna --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel,diffop" - name: Run tests (echidna-optim) run: cargo test -p echidna-optim @@ -53,7 +53,7 @@ jobs: run: cargo fmt --all -- --check - name: Clippy (all non-CUDA features) - run: cargo clippy --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel,gpu-wgpu" -- -D warnings + run: cargo clippy --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel,diffop,gpu-wgpu" -- -D warnings - name: Clippy (echidna-optim) run: cargo clippy -p echidna-optim --all-features -- -D warnings @@ -93,12 +93,30 @@ jobs: - name: Cache cargo uses: Swatinem/rust-cache@v2 - - name: Build + - name: Build (default features) run: cargo build - name: Test (default features) run: cargo test + - name: Build (bytecode) + run: cargo build --features bytecode + + - name: Test (bytecode) + run: cargo test --features bytecode + + - name: Build (taylor) + run: cargo build --features taylor + + - name: Test (taylor) + run: cargo test --features taylor + + - name: Build (stde) + run: cargo build --features stde + + - name: Test (stde) + run: cargo test --features stde + coverage: name: Coverage runs-on: blacksmith-2vcpu-ubuntu-2404 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 43495ec..39bee8c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -27,19 +27,19 @@ jobs: run: cargo fmt --all -- --check - name: Build (all features) - run: cargo build -p echidna --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray" + run: cargo build -p echidna --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel" - name: Test echidna - run: cargo test -p echidna --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray" + run: cargo test -p echidna --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel" - name: Test echidna-optim run: cargo test -p echidna-optim - name: Clippy - run: cargo clippy --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray" -- -D warnings + run: cargo clippy --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel" -- -D warnings - name: Build docs - run: cargo doc --no-deps --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray" + run: cargo doc --no-deps --features "bytecode,taylor,laurent,stde,serde,faer,nalgebra,ndarray,parallel" env: RUSTDOCFLAGS: -D warnings diff --git a/CLEANUP_PLAN.md b/CLEANUP_PLAN.md new file mode 100644 index 0000000..092151c --- /dev/null +++ b/CLEANUP_PLAN.md @@ -0,0 +1,197 @@ +# Codebase Cleanup Plan + +**Generated**: 2026-03-13 | **Project**: echidna v0.4.0 | **LOC**: ~23,400 source, ~15,700 tests + +## Executive Summary + +Three specialist agents (Infrastructure, Design, Documentation) reviewed the echidna codebase. The project is in strong shape — zero lint errors, zero dead code, zero debug artifacts, clean architecture with correct dependency direction, and comprehensive test coverage across 37 integration test files. + +The main findings are: +- **13 unsafe blocks missing SAFETY comments** (carried forward from prior audit, still unaddressed) +- **8 lint suppressions missing explanatory comments** +- **CI gaps**: `diffop` feature has zero CI coverage; MSRV job only tests default features +- **5 code duplication sites** totalling ~217 lines (after prior cleanup resolved the larger duplications) +- **4 documentation files** need minor updates (~30 min of text edits) +- **1 security advisory** (paste via simba) — blocked upstream, already ignored in deny.toml + +No critical issues. No breaking changes required. All items are incremental improvements. + +## Current State + +- **Architecture**: Clean. Dependencies point inward. Domain logic free of I/O (except disk checkpointing, appropriately isolated). No circular dependencies. Thread-local tape is hidden global state but well-mitigated by RAII guards and API layer. +- **Test Coverage**: Comprehensive integration tests across all major subsystems. cargo-tarpaulin runs in CI. Gaps in echidna-optim solvers and cross-country full-tape integration. +- **Documentation**: High quality overall. README, rustdoc, CHANGELOG, and algorithms.md are thorough. 4 files have minor staleness from the v0.4.0 bytecode_tape decomposition. +- **Dependency Health**: 1 advisory (paste/simba, blocked upstream). cudarc 2 minor versions behind (breaking changes). All other deps at latest. +- **Lint Health**: 0 errors, 0 default-level warnings. 686 pedantic warnings (mostly `must_use_candidate` and doc suggestions). 13 suppressions, all still needed, 8 lacking comments. + +## Memory Context + +- **Decisions from History**: Thread-local tape design for `Copy` on `Reverse`, dual tape architecture (Adept + BytecodeTape), speed-first philosophy (NaN propagation, no Result branching) — all still current and load-bearing. +- **Known Tech Debt**: Welford duplication (RESOLVED — consolidated into `WelfordAccumulator`), reverse sweep duplication (RESOLVED — `reverse_sweep_core`), DCE duplication (RESOLVED — `dce_compact`), output_indices boilerplate (RESOLVED — `all_output_indices()`). Remaining: num_traits macro consolidation (deferred, unfavorable trade-off), GPU trait extraction (deferred, large refactor). +- **Past Attempts**: Cleanup PRs #12-16 completed Phases 0-4. Phase 5 (duplication): items 5.5, 5.8 done; 5.6 deliberately dropped. Memory specialist runs failed 3+ times due to context overflow. +- **Dependency History**: RUSTSEC-2024-0436 (paste/simba) blocked upstream since 2024, ignored in deny.toml. RUSTSEC-2025-0141 (bincode) resolved by replacing with postcard/bitcode. cudarc 0.18 breaking change deferred. nalgebra bumped to 0.34 in v0.4.0. +- **Lint/Suppression History**: All 13 `#[allow]` suppressions were reviewed during Phase 3 cleanup. `clippy::suspicious_arithmetic_impl` suppressions are justified (AD math where Mul impl does addition in tangent). MSRV updated from 1.80 to 1.93. + +## Dependency Health + +### Security Fixes (Priority) + +| Dependency | Current | Fix Version | Vulnerability | Severity | +|-----------|---------|-------------|---------------|----------| +| paste (transitive, via simba) | 1.0.15 | N/A | RUSTSEC-2024-0436 — unmaintained | Low (advisory only) | + +Already ignored in `deny.toml` with explanatory comment. Blocked on simba upstream migration. No action required. + +### At-Risk Dependencies + +| Dependency | Risk | Issue | Action | Alternative / Notes | +|-----------|------|-------|--------|---------------------| +| paste (via simba) | Medium | Unmaintained since Oct 2024 | Wait for simba upstream | Blocked — no action possible | +| cudarc | Medium | 0.17 pinned, 0.19.3 available; 0.18 has breaking API | Bump when ready for API migration | Memory confirms upgrade was previously deferred | +| simba | Medium | Pulls in unmaintained paste; single org (dimforge) | Monitor | No alternative for nalgebra ecosystem | + +### Version Bumps + +| Dependency | Current | Latest | Breaking | Notes | +|-----------|---------|--------|----------|-------| +| cudarc | 0.17 | 0.19.3 | Yes (0.18) | gpu-cuda feature; deferred from prior audit | +| num-dual | 0.11 | 0.13.6 | Minor | Dev-dep only; comparison benchmarks | + +12 transitive crates have duplicate versions in lockfile (from faer/wgpu sub-trees). Not directly addressable. + +## Lint & Static Analysis + +### Errors + +None. Zero errors across all lint levels. + +### Warnings (by category) + +Default-level clippy (enforced in CI with `-D warnings`) produces zero warnings. + +Pedantic-level findings (686 total, not currently enforced): + +| Category | Count | Action | +|----------|-------|--------| +| `must_use_candidate` | ~267 | Consider bulk-adding `#[must_use]` to pure functions in a future pass | +| Missing doc sections (`# Panics`, backticks) | ~124 | Incremental doc improvement | +| Cast portability (`usize` to `u32`) | ~68 | Audit CUDA/GPU paths for truncation risk | +| `inline_always` suggestions | ~35 | Review if `#[inline]` alone suffices (simba trait impls) | +| Auto-fixable style | ~50 | `cargo clippy --fix` for format args, redundant closures | +| Similar names | ~8 | Review for clarity | + +### Suppression Audit + +| File:Line | Suppression | Verdict | Action | +|-----------|------------|---------|--------| +| `bytecode_tape/jacobian.rs:142` | `needless_range_loop` | Valid | Keep (has comment) | +| `traits/dual_vec_ops.rs:31` | `suspicious_arithmetic_impl` | Valid | Keep (has comment) | +| `traits/laurent_std_ops.rs:79,91,212` | `suspicious_arithmetic_impl` | Valid | **Add comments** — AD math where Mul does addition in tangent | +| `traits/taylor_std_ops.rs:36,48,190,272,282` | `suspicious_arithmetic_impl` | Valid | **Add comments** — same pattern | +| `cross_country.rs:39` | `too_many_arguments` | Valid | Keep (has comment) | +| `diffop.rs:582` | `needless_range_loop` | Valid | **Add comment** | +| `float.rs:54` | `cfg_attr(dead_code)` | Valid | Keep (has comment) | + +**Summary**: 13 suppressions, all still needed. **8 need explanatory comments added.** + +## Dead Code & Artifact Removal + +### Immediate Removal + +None. Zero dead code, zero debug artifacts, zero TODO/FIXME/HACK comments in source. + +### Verify Before Removal + +| Item | Location | Verification Needed | +|------|----------|---------------------| +| Packaging artifacts | `target/package/echidna-0.1.0/Cargo.toml.orig` | Gitignored; harmless. `cargo clean` removes. | +| Unused deny.toml license entries | `deny.toml` L14-24 | BSD-3-Clause, BSL-1.0 may be unused. Verify with `cargo deny check`. Low priority. | + +## Documentation Consolidation + +### Documents to Update + +| Document | Updates Required | +|----------|-----------------| +| `CONTRIBUTING.md` L154-211 | **Most impactful**: Replace `bytecode_tape.rs` single-file entry with `bytecode_tape/` directory listing (11 submodules). Misleads new contributors. | +| `docs/algorithms.md` L529 | Fix "43 opcodes" to "44 opcodes" (matches L75 and L513 in same doc, and verified OpCode enum). | +| `docs/adr-deferred-work.md` L23 | Move nalgebra 0.33->0.34 from Deferred to Done (completed in v0.4.0 per CHANGELOG). | +| `docs/roadmap.md` | Update 6 stale `src/bytecode_tape.rs` references to point to correct submodule files. | + +### Documents to Remove/Merge + +| Document | Action | Target | +|----------|--------|--------| +| `docs/plans/README.md` | Consider merging | `docs/roadmap.md` — 15-line file that only points to roadmap | + +## Refactoring Roadmap + +### Phase 0: Safety & Compliance (~1-2 hours) + +| Task | Impact | Effort | Files Affected | +|------|--------|--------|----------------| +| Add SAFETY comments to 13 unsafe blocks | High | Small | `checkpoint.rs` (2), `cuda_backend.rs` (6), `simba_impls.rs` (4), `taylor_dyn.rs` (1) | +| Add explanatory comments to 8 `#[allow]` suppressions | Medium | Trivial | `laurent_std_ops.rs` (3), `taylor_std_ops.rs` (5), `diffop.rs` (1) | + +### Phase 1: CI & Infrastructure (~30 min) + +| Task | Impact | Effort | Files Affected | +|------|--------|--------|----------------| +| Add `diffop` feature to CI test and lint jobs | High | Trivial | `.github/workflows/ci.yml` | +| Add `parallel` feature to `publish.yml` pre-publish validation | Medium | Trivial | `.github/workflows/publish.yml` | +| Expand MSRV job to test key feature combinations (bytecode, taylor, stde) | Medium | Trivial | `.github/workflows/ci.yml` | + +### Phase 2: Documentation (~30 min) + +| Task | Impact | Effort | Files Affected | +|------|--------|--------|----------------| +| Update CONTRIBUTING.md architecture tree | High | Trivial | `CONTRIBUTING.md` | +| Fix algorithms.md opcode count (43->44) | Low | Trivial | `docs/algorithms.md` | +| Move nalgebra entry to Done in ADR | Low | Trivial | `docs/adr-deferred-work.md` | +| Update roadmap.md bytecode_tape paths | Low | Trivial | `docs/roadmap.md` | + +### Phase 3: Duplication Consolidation (~2-3 hours) + +| Task | Impact | Effort | Files Affected | +|------|--------|--------|----------------| +| Consolidate `greedy_coloring` → delegate to `greedy_distance1_coloring` | Medium | Small | `src/sparse.rs` (~72 lines) | +| Consolidate `sparse_hessian` → call `sparse_hessian_with_pattern` | Medium | Small | `src/bytecode_tape/sparse.rs` (~50 lines) | +| Extract shared opcode dispatch from `forward`/`forward_into` | Medium | Small | `src/bytecode_tape/forward.rs` (~40 lines) | +| Consolidate `column_coloring`/`row_coloring` → generic helper | Low | Small | `src/sparse.rs` (~30 lines) | +| Extract helper from `GpuTapeData::from_tape`/`from_tape_f64_lossy` | Low | Trivial | `src/gpu/mod.rs` (~25 lines) | + +### Phase 4: Code Quality (~optional, future) + +| Task | Impact | Effort | Components | +|------|--------|--------|------------| +| Decompose `stde.rs` (1409 lines) into sub-modules | Medium | Medium | `src/stde/` directory | +| Add `#![warn(missing_docs)]` and fill gaps | Medium | Large | All public items | +| Bulk-add `#[must_use]` to pure functions | Low | Medium | ~267 sites | +| Audit `usize` to `u32` casts in GPU paths | Medium | Small | `src/gpu/`, `src/bytecode_tape/` | +| Bump cudarc 0.17 → 0.19 (breaking API changes) | Medium | Medium | `src/gpu/cuda_backend.rs` | + +## Testing Strategy + +**Current state is strong.** 37 integration test files cover all major subsystems. Known gaps: + +| Gap | Priority | Action | +|-----|----------|--------| +| `echidna-optim` solver convergence edge cases | High | Add convergence tests for ill-conditioned problems | +| `cross_country.rs` full-tape integration | Medium | Add end-to-end test with real BytecodeTape | +| `bytecode_tape/optimize.rs` CSE edge cases | Medium | Add tests for pathological CSE remapping scenarios | +| Sparse Jacobian reverse-mode path | Medium | Ensure auto-selection exercises reverse path in tests | + +## Target State + +- **Test Coverage**: Maintain current breadth; fill 4 identified gaps +- **Architecture**: No changes needed — already clean +- **Documentation**: All docs accurate; SAFETY comments on all unsafe blocks; suppressions documented +- **Key Improvements**: CI covers all features; zero undocumented unsafe; zero undocumented suppressions + +## Risks & Considerations + +- **paste/simba advisory**: Blocked upstream. Monitor simba releases for pastey migration. No action possible now. +- **cudarc upgrade**: Breaking API changes in 0.18. Defer until GPU backend is actively developed. +- **stde.rs decomposition**: Medium effort, risk of breaking the delicate Taylor jet propagation logic. Only pursue if the file continues to grow. +- **`#![warn(missing_docs)]`**: Large effort (all public items need docs). Consider enabling per-module incrementally. +- **Pedantic clippy**: 686 warnings. Do NOT enable `-W clippy::pedantic` in CI without triaging — many are noise for a math library. Cherry-pick useful categories (must_use, cast truncation). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0620da4..577aa4e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -162,7 +162,18 @@ src/ ├── reverse.rs # Reverse reverse-mode type ├── api.rs # Public API: grad, jvp, vjp, jacobian, hessian, ... ├── breverse.rs # BReverse bytecode-tape reverse variable [bytecode] -├── bytecode_tape.rs # BytecodeTape SoA representation [bytecode] +├── bytecode_tape/ +│ ├── mod.rs # BytecodeTape SoA representation, core API [bytecode] +│ ├── forward.rs # Forward evaluation and tangent sweeps [bytecode] +│ ├── jacobian.rs # Jacobian computation (forward/reverse/cross-country) [bytecode] +│ ├── optimize.rs # Tape optimization passes (CSE, DCE, simplification) [bytecode] +│ ├── parallel.rs # Parallel/batched evaluation [bytecode] +│ ├── reverse.rs # Reverse sweeps and adjoint computation [bytecode] +│ ├── serde_support.rs # Serde serialization/deserialization [bytecode, serde] +│ ├── sparse.rs # Sparse derivative support [bytecode] +│ ├── tangent.rs # Tangent-mode evaluation (Taylor, nonsmooth) [bytecode] +│ ├── taylor.rs # Taylor-specific tape operations (ODE, grad) [bytecode, taylor] +│ └── thread_local.rs # Thread-local tape storage [bytecode] ├── opcode.rs # Opcode definitions and dispatch (44 opcodes) [bytecode] ├── sparse.rs # Sparsity detection and graph coloring [bytecode] ├── cross_country.rs # Markowitz vertex elimination [bytecode] diff --git a/docs/adr-deferred-work.md b/docs/adr-deferred-work.md index 419d1f4..b576c08 100644 --- a/docs/adr-deferred-work.md +++ b/docs/adr-deferred-work.md @@ -16,11 +16,18 @@ All core roadmap items (R1–R13), Phases 1–5, and the STDE deferred items pla --- +## Done (previously deferred, now completed) + +| Item | Completed In | Notes | +|------|-------------|-------| +| nalgebra 0.33 → 0.34 | v0.4.0 | MSRV blocker resolved (1.93 ≥ 1.87+). Upgrade completed with no API breakage. | + +--- + ## Deferred (valuable, not yet needed) | Item | Reasoning | Revisit When | |------|-----------|--------------| -| nalgebra 0.33 → 0.34 | MSRV blocker resolved (now 1.93, nalgebra 0.34 requires 1.87+). May have API changes in `NalgebraVec` and sparse matrix types. | When convenient; test thoroughly for API changes. | | Indefinite dense STDE | Eigendecomposition-based approach for indefinite coefficient matrices C (6 parameters, sign-splitting into C⁺ - C⁻) adds significant API complexity. The positive-definite Cholesky case (`dense_stde_2nd`) covers most PDE use cases (Fokker-Planck, Black-Scholes, HJB). Users with indefinite C can manually split into C⁺ - C⁻, compute Cholesky factors for each, and call `dense_stde_2nd` twice. | A concrete user need for indefinite C arises | | General-K GPU Taylor kernels | GPU Taylor kernel is hardcoded to K=3 (second-order). Hardcoding allows complete loop unrolling — critical for GPU performance. General K would need dynamic loops or a family of K-specialized kernels. | Need for GPU-accelerated 3rd+ order derivatives | | Chunked GPU Taylor dispatch | Working buffer is `3 * num_variables * batch_size * 4` bytes. WebGPU's 128 MB limit caps `num_variables * batch_size ≤ ~10M`. For larger problems, the dispatch function should chunk the batch and accumulate results. | Users hit the buffer limit in practice | diff --git a/docs/algorithms.md b/docs/algorithms.md index 7606e71..6cab489 100644 --- a/docs/algorithms.md +++ b/docs/algorithms.md @@ -526,7 +526,7 @@ propagation. Each thread processes one batch element, propagating a `(c0, c1, c2)` triple through the tape where `c0` is the primal value, `c1` is the directional first derivative, and `c2 = v^T H v / 2`. The memory layout is interleaved per variable per thread: `[c0_i, c1_i, c2_i]` contiguous per -variable, optimizing per-thread locality. All 43 opcodes implement full +variable, optimizing per-thread locality. All 44 opcodes implement full second-order Taylor propagation rules: arithmetic via Cauchy products, transcendentals via logarithmic derivative recurrences, and coupled recurrences for sin/cos, sinh/cosh, and tan/tanh. diff --git a/docs/roadmap.md b/docs/roadmap.md index 80660c0..adaa187 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -61,13 +61,13 @@ Key files: `src/taylor.rs`, `src/taylor_dyn.rs`, `src/taylor_ops.rs`, `src/trait Reverse-over-Taylor: `taylor_grad` builds Taylor inputs `x_i(t) = x_i + v_i·t`, runs `forward_tangent`, then `reverse_tangent` to get Taylor-valued adjoints. `adjoint[i].coeff(0)` = gradient, `adjoint[i].coeff(1)` = HVP, `adjoint[i].derivative(k)` = k-th order directional adjoint. For K=2 equivalent to `hvp()`; for K≥3 yields third-order and higher. Feature-gated behind `taylor`. `taylor_grad_with_buf` variant reuses buffers. 11 tests cross-validating against `hvp`, `gradient`, `hessian`, and `third_order_hvvp`. -Key file: `src/bytecode_tape.rs`. +Key file: `src/bytecode_tape/taylor.rs`. **R1d. ODE Taylor series integration** — **COMPLETE** Given `y' = f(y)`, computes the Taylor expansion of `y(t)` to order K via coefficient bootstrapping: `y_{k+1} = coeff_k(f(y(t))) / (k+1)`. Uses K-1 forward passes through the tape, building up Taylor inputs incrementally. Returns `Vec>` so users get `eval_at(h)` for stepping, `coeff(k)` for error estimation, and `derivative(k)` for the solution derivatives. Feature-gated behind `taylor`. Follows the `taylor_grad` / `hvp` dual-method pattern (allocating + buffer-reusing). 8 tests covering exponential growth/decay, quadratic blowup, 2D rotation, step evaluation, minimal K=2, buffer reuse, and standalone `eval_at`. -Key file: `src/bytecode_tape.rs`. +Key file: `src/bytecode_tape/taylor.rs`. ### R2. Stochastic Taylor Derivative Estimators (STDE) **STDE paper (Shi et al., NeurIPS 2024)** @@ -135,7 +135,7 @@ Vertex elimination on the linearized computational graph with Markowitz-based or Public API: `BytecodeTape::jacobian_cross_country(&mut self, inputs: &[F]) -> Vec>`. Handles identity/passthrough (output-is-input), custom ops, constant nodes, and all standard opcodes. Generic over `F: Float`. 10 tests cross-validating against `jacobian()` (reverse mode) and `jacobian_forward()`. -Key files: `src/cross_country.rs`, `src/bytecode_tape.rs`. +Key files: `src/cross_country.rs`, `src/bytecode_tape/jacobian.rs`. ### R6. Nonsmooth Extensions — **COMPLETE** **Book Phase 7 (Ch 14)** @@ -144,13 +144,13 @@ Key files: `src/cross_country.rs`, `src/bytecode_tape.rs`. `BytecodeTape::forward_nonsmooth` detects abs/min/max kinks during forward evaluation, records `KinkEntry` with tape index, opcode, switching value, and branch taken. `NonsmoothInfo` provides `active_kinks(tol)`, `is_smooth(tol)`, and `signature()`. Feature-gated behind `bytecode`. 8 tests. -Key files: `src/nonsmooth.rs`, `src/bytecode_tape.rs`. +Key files: `src/nonsmooth.rs`, `src/bytecode_tape/forward.rs`. **R6b. Clarke generalized Jacobian** — **COMPLETE** `BytecodeTape::clarke_jacobian` enumerates all 2^k sign combinations for k active kinks, computing limiting Jacobians via `reverse_with_forced_signs`. `jacobian_limiting` allows explicit forced branch choices. `forced_reverse_partials` in `opcode.rs` overrides nonsmooth op partials. Configurable kink limit (default 20) returns `ClarkeError::TooManyKinks`. 8 tests. -Key files: `src/opcode.rs`, `src/bytecode_tape.rs`. +Key files: `src/opcode.rs`, `src/bytecode_tape/jacobian.rs`. **R6c. Laurent numbers** — **COMPLETE** @@ -167,7 +167,7 @@ Key files: `src/laurent.rs`, `src/traits/laurent_std_ops.rs`, `src/traits/lauren `serde` support for `BytecodeTape` with manual Serialize/Deserialize implementations. Custom ops are rejected at serialization time (must be re-registered after deserialization). R6 types (`KinkEntry`, `NonsmoothInfo`, `ClarkeError`) and `Laurent` also support serde (Laurent uses manual impls due to const-generic array limitations in serde's derive). Feature-gated behind `serde` flag. JSON and bincode roundtrip verified. 9 tests covering f32/f64, single/multi-output tapes, binary format, custom-op rejection, sparsity patterns, and nonsmooth info. -Key files: `src/bytecode_tape.rs`, `src/nonsmooth.rs`, `src/laurent.rs`, `tests/serde.rs`. +Key files: `src/bytecode_tape/serde_support.rs`, `src/nonsmooth.rs`, `src/laurent.rs`, `tests/serde.rs`. ### R8. Benchmarking Infrastructure — **COMPLETE** @@ -253,7 +253,7 @@ Recording-time algebraic simplification and targeted multi-output DCE. - Constant deduplication: `FloatBits` trait can't be implemented for `Dual`; CSE handles ops over duplicate constants - Cross-checkpoint DCE: too complex for this milestone -Key file: `src/bytecode_tape.rs`. 22 new tests in `tests/tape_optimization.rs`. +Key file: `src/bytecode_tape/optimize.rs`. 22 new tests in `tests/tape_optimization.rs`. --- diff --git a/src/bytecode_tape/forward.rs b/src/bytecode_tape/forward.rs index e17ebe4..faeff59 100644 --- a/src/bytecode_tape/forward.rs +++ b/src/bytecode_tape/forward.rs @@ -1,6 +1,48 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::bytecode_tape::CustomOp; use crate::float::Float; use crate::opcode::{self, OpCode, UNUSED}; +/// Evaluate all non-Input, non-Const operations on a values buffer. +/// +/// Shared by `forward` (in-place) and `forward_into` (external buffer). +fn forward_dispatch( + opcodes: &[OpCode], + arg_indices: &[[u32; 2]], + values: &mut [F], + custom_ops: &[Arc>], + custom_second_args: &HashMap, +) { + for i in 0..opcodes.len() { + match opcodes[i] { + OpCode::Input | OpCode::Const => continue, + OpCode::Custom => { + let [a_idx, cb_idx] = arg_indices[i]; + let a = values[a_idx as usize]; + let b = custom_second_args + .get(&(i as u32)) + .map(|&bi| values[bi as usize]) + .unwrap_or(F::zero()); + values[i] = custom_ops[cb_idx as usize].eval(a, b); + } + op => { + let [a_idx, b_idx] = arg_indices[i]; + let a = values[a_idx as usize]; + let b = if b_idx != UNUSED && op != OpCode::Powi { + values[b_idx as usize] + } else if op == OpCode::Powi { + F::from(b_idx).unwrap_or_else(|| F::zero()) + } else { + F::zero() + }; + values[i] = opcode::eval_forward(op, a, b); + } + } + } +} + impl super::BytecodeTape { /// Re-evaluate the tape at new inputs (forward sweep). /// @@ -12,39 +54,17 @@ impl super::BytecodeTape { "wrong number of inputs" ); - // Overwrite input values. for (i, &v) in inputs.iter().enumerate() { self.values[i] = v; } - // Re-evaluate all non-Input, non-Const ops. - for i in 0..self.opcodes.len() { - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - let [a_idx, cb_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - let b = self - .custom_second_args - .get(&(i as u32)) - .map(|&bi| self.values[bi as usize]) - .unwrap_or(F::zero()); - self.values[i] = self.custom_ops[cb_idx as usize].eval(a, b); - } - op => { - let [a_idx, b_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - self.values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - self.values[i] = opcode::eval_forward(op, a, b); - } - } - } + forward_dispatch( + &self.opcodes, + &self.arg_indices, + &mut self.values, + &self.custom_ops, + &self.custom_second_args, + ); } /// Forward sweep with nonsmooth branch tracking. @@ -152,33 +172,12 @@ impl super::BytecodeTape { values_buf[i] = v; } - // Re-evaluate all non-Input, non-Const ops. - for i in 0..self.opcodes.len() { - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - let [a_idx, cb_idx] = self.arg_indices[i]; - let a = values_buf[a_idx as usize]; - let b = self - .custom_second_args - .get(&(i as u32)) - .map(|&bi| values_buf[bi as usize]) - .unwrap_or(F::zero()); - values_buf[i] = self.custom_ops[cb_idx as usize].eval(a, b); - } - op => { - let [a_idx, b_idx] = self.arg_indices[i]; - let a = values_buf[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - values_buf[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - values_buf[i] = opcode::eval_forward(op, a, b); - } - } - } + forward_dispatch( + &self.opcodes, + &self.arg_indices, + values_buf, + &self.custom_ops, + &self.custom_second_args, + ); } } diff --git a/src/bytecode_tape/mod.rs b/src/bytecode_tape/mod.rs index f08579c..c1e48bc 100644 --- a/src/bytecode_tape/mod.rs +++ b/src/bytecode_tape/mod.rs @@ -120,6 +120,7 @@ pub struct BytecodeTape { impl BytecodeTape { /// Create an empty bytecode tape. + #[must_use] pub fn new() -> Self { BytecodeTape { opcodes: Vec::new(), @@ -135,6 +136,7 @@ impl BytecodeTape { } /// Create a bytecode tape with pre-allocated capacity. + #[must_use] pub fn with_capacity(est_ops: usize) -> Self { BytecodeTape { opcodes: Vec::with_capacity(est_ops), @@ -366,6 +368,7 @@ impl BytecodeTape { /// Get the output value (available after `forward()` or initial recording). #[inline] + #[must_use] pub fn output_value(&self) -> F { self.values[self.output_index as usize] } @@ -375,18 +378,21 @@ impl BytecodeTape { /// Use this with the buffer produced by [`forward_tangent`](Self::forward_tangent) /// to read the output: `buf[tape.output_index()]`. #[inline] + #[must_use] pub fn output_index(&self) -> usize { self.output_index as usize } /// Number of input variables. #[inline] + #[must_use] pub fn num_inputs(&self) -> usize { self.num_inputs as usize } /// Number of operations (including inputs and constants). #[inline] + #[must_use] pub fn num_ops(&self) -> usize { self.opcodes.len() } @@ -407,6 +413,7 @@ impl BytecodeTape { } /// Number of output variables. Returns 1 in single-output mode. + #[must_use] pub fn num_outputs(&self) -> usize { if self.output_indices.is_empty() { 1 @@ -418,6 +425,7 @@ impl BytecodeTape { /// Get all output values (available after `forward()` or initial recording). /// /// In single-output mode, returns a single-element vector. + #[must_use] pub fn output_values(&self) -> Vec { if self.output_indices.is_empty() { vec![self.values[self.output_index as usize]] @@ -433,6 +441,7 @@ impl BytecodeTape { /// /// For multi-output tapes, returns all registered output indices. /// For single-output tapes, returns a single-element slice. + #[must_use] pub fn all_output_indices(&self) -> &[u32] { if self.output_indices.is_empty() { std::slice::from_ref(&self.output_index) @@ -445,30 +454,35 @@ impl BytecodeTape { /// Slice view of all opcodes in the tape. #[inline] + #[must_use] pub fn opcodes_slice(&self) -> &[OpCode] { &self.opcodes } /// Slice view of all argument index pairs `[arg0, arg1]`. #[inline] + #[must_use] pub fn arg_indices_slice(&self) -> &[[u32; 2]] { &self.arg_indices } /// Slice view of all primal values in the tape. #[inline] + #[must_use] pub fn values_slice(&self) -> &[F] { &self.values } /// Total number of tape entries (inputs + constants + operations). #[inline] + #[must_use] pub fn num_variables_count(&self) -> usize { self.num_variables as usize } /// Returns `true` if the tape contains any custom operations. #[inline] + #[must_use] pub fn has_custom_ops(&self) -> bool { !self.custom_ops.is_empty() } diff --git a/src/bytecode_tape/reverse.rs b/src/bytecode_tape/reverse.rs index a1be6af..0c75cfd 100644 --- a/src/bytecode_tape/reverse.rs +++ b/src/bytecode_tape/reverse.rs @@ -85,6 +85,7 @@ impl super::BytecodeTape { /// Reverse sweep: compute adjoints seeded at the output. /// /// Returns the full adjoint vector (length = `num_variables`). + #[must_use] pub fn reverse(&self, seed_index: u32) -> Vec { let n = self.num_variables as usize; let mut adjoints = vec![F::zero(); n]; diff --git a/src/bytecode_tape/sparse.rs b/src/bytecode_tape/sparse.rs index b0998c3..b807b22 100644 --- a/src/bytecode_tape/sparse.rs +++ b/src/bytecode_tape/sparse.rs @@ -7,6 +7,7 @@ impl super::BytecodeTape { /// /// Walks the tape forward propagating input-dependency bitsets. /// At nonlinear operations, marks cross-pairs as potential Hessian interactions. + #[must_use] pub fn detect_sparsity(&self) -> crate::sparse::SparsityPattern { crate::sparse::detect_sparsity_impl( &self.opcodes, @@ -20,6 +21,7 @@ impl super::BytecodeTape { /// /// Walks the tape forward propagating input-dependency bitsets (first-order). /// For each output, determines which inputs it depends on. + #[must_use] pub fn detect_jacobian_sparsity(&self) -> crate::sparse::JacobianSparsityPattern { let out_indices = self.all_output_indices(); crate::sparse::detect_jacobian_sparsity_impl( @@ -45,49 +47,8 @@ impl super::BytecodeTape { let pattern = self.detect_sparsity(); let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); - - let mut hessian_values = vec![F::zero(); pattern.nnz()]; - let mut gradient = vec![F::zero(); n]; - let mut value = F::zero(); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf = Vec::new(); - let mut adjoint_buf = Vec::new(); - let mut v = vec![F::zero(); n]; - - for color in 0..num_colors { - // Form direction vector: v[i] = 1 if colors[i] == color, else 0 - for i in 0..n { - v[i] = if colors[i] == color { - F::one() - } else { - F::zero() - }; - } - - self.hvp_with_all_bufs( - x, - &v, - &mut dual_input_buf, - &mut dual_vals_buf, - &mut adjoint_buf, - ); - - if color == 0 { - value = dual_vals_buf[self.output_index as usize].re; - for i in 0..n { - gradient[i] = adjoint_buf[i].re; - } - } - - // Extract Hessian entries for this color. - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - if colors[col as usize] == color { - hessian_values[k] = adjoint_buf[row as usize].eps; - } - } - } - + let (value, gradient, hessian_values) = + self.sparse_hessian_with_pattern(x, &pattern, &colors, num_colors); (value, gradient, pattern, hessian_values) } diff --git a/src/checkpoint.rs b/src/checkpoint.rs index 8f71c62..cc22f12 100644 --- a/src/checkpoint.rs +++ b/src/checkpoint.rs @@ -453,6 +453,9 @@ pub fn grad_checkpointed_disk( } fn write_checkpoint(state: &[F], path: &std::path::Path) { + // SAFETY: `F: Float` is `Copy + Sized` with no padding or pointers, so its + // raw byte representation is well-defined. The pointer comes from a valid + // slice, and the length is exactly `size_of_val(state)` bytes. let bytes: &[u8] = unsafe { std::slice::from_raw_parts(state.as_ptr().cast::(), std::mem::size_of_val(state)) }; @@ -469,6 +472,9 @@ fn read_checkpoint(path: &std::path::Path, dim: usize) -> Vec { bytes.len() ); let mut state = vec![F::zero(); dim]; + // SAFETY: `F: Float` is `Copy + Sized` with no padding or pointers. The + // byte length was validated by the assert above to equal `dim * size_of::()`, + // so the copy stays within bounds of both the source and destination buffers. unsafe { std::ptr::copy_nonoverlapping(bytes.as_ptr(), state.as_mut_ptr().cast::(), bytes.len()); } diff --git a/src/diffop.rs b/src/diffop.rs index f4a0446..fa1f66c 100644 --- a/src/diffop.rs +++ b/src/diffop.rs @@ -91,6 +91,7 @@ impl MultiIndex { /// # Panics /// /// Panics if `orders` is empty. + #[must_use] pub fn new(orders: &[u8]) -> Self { assert!( !orders.is_empty(), @@ -106,6 +107,7 @@ impl MultiIndex { /// # Panics /// /// Panics if `var >= num_vars` or `order == 0`. + #[must_use] pub fn diagonal(num_vars: usize, var: usize, order: u8) -> Self { assert!(var < num_vars, "var ({}) >= num_vars ({})", var, num_vars); assert!(order > 0, "order must be > 0"); @@ -119,16 +121,19 @@ impl MultiIndex { /// # Panics /// /// Panics if `var >= num_vars`. + #[must_use] pub fn partial(num_vars: usize, var: usize) -> Self { Self::diagonal(num_vars, var, 1) } /// Total differentiation order: `Σ orders[i]`. + #[must_use] pub fn total_order(&self) -> usize { self.orders.iter().map(|&o| o as usize).sum() } /// Active variables: indices where `orders[i] > 0`, paired with their order. + #[must_use] pub fn active_vars(&self) -> Vec<(usize, u8)> { self.orders .iter() @@ -139,11 +144,13 @@ impl MultiIndex { } /// Number of variables in this multi-index. + #[must_use] pub fn num_vars(&self) -> usize { self.orders.len() } /// The per-variable differentiation orders. + #[must_use] pub fn orders(&self) -> &[u8] { &self.orders } @@ -404,6 +411,7 @@ impl JetPlan { /// /// Panics if `multi_indices` is empty, if any multi-index has wrong `num_vars`, /// or if slot assignment fails. + #[must_use] pub fn plan(num_vars: usize, multi_indices: &[MultiIndex]) -> Self { assert!( !multi_indices.is_empty(), @@ -450,11 +458,13 @@ impl JetPlan { } /// The maximum jet order across all groups. + #[must_use] pub fn jet_order(&self) -> usize { self.max_jet_order } /// The multi-indices this plan computes, in order. + #[must_use] pub fn multi_indices(&self) -> Vec { self.multi_indices.clone() } @@ -579,6 +589,9 @@ pub fn mixed_partial( /// # Panics /// /// Panics if `x.len()` does not match `tape.num_inputs()`. +// Index variables i, j are used to construct MultiIndex values, index into `orders`, +// and fill both triangles of the symmetric Hessian matrix — iterators would obscure the +// mathematical indexing logic. #[allow(clippy::needless_range_loop)] pub fn hessian( tape: &BytecodeTape, @@ -660,6 +673,7 @@ impl DiffOp { /// # Panics /// /// Panics if `terms` is empty or any multi-index has wrong `num_vars`. + #[must_use] pub fn new(num_vars: usize, terms: Vec<(F, MultiIndex)>) -> Self { assert!(!terms.is_empty(), "DiffOp must have at least one term"); for (_, mi) in &terms { @@ -686,6 +700,7 @@ impl DiffOp { } /// Laplacian: `Σ_j ∂²/∂x_j²`. + #[must_use] pub fn laplacian(n: usize) -> Self { let terms = (0..n) .map(|j| (F::one(), MultiIndex::diagonal(n, j, 2))) @@ -694,6 +709,7 @@ impl DiffOp { } /// Biharmonic: `Σ_j ∂⁴/∂x_j⁴`. + #[must_use] pub fn biharmonic(n: usize) -> Self { let terms = (0..n) .map(|j| (F::one(), MultiIndex::diagonal(n, j, 4))) @@ -702,6 +718,7 @@ impl DiffOp { } /// k-th order diagonal: `Σ_j ∂^k/∂x_j^k`. + #[must_use] pub fn diagonal(n: usize, k: u8) -> Self { assert!(k >= 1, "diagonal order must be >= 1"); let terms = (0..n) @@ -711,16 +728,19 @@ impl DiffOp { } /// The terms of the operator. + #[must_use] pub fn terms(&self) -> &[(F, MultiIndex)] { &self.terms } /// Number of variables. + #[must_use] pub fn num_vars(&self) -> usize { self.num_vars } /// Maximum total order across all terms. + #[must_use] pub fn order(&self) -> usize { self.terms .iter() @@ -730,6 +750,7 @@ impl DiffOp { } /// True if every term has exactly one active variable (no mixed partials). + #[must_use] pub fn is_diagonal(&self) -> bool { self.terms.iter().all(|(_, mi)| mi.active_vars().len() <= 1) } @@ -738,6 +759,7 @@ impl DiffOp { /// /// Returns a vector of `DiffOp`, each containing terms with the same /// total order, sorted by increasing order. + #[must_use] pub fn split_by_order(&self) -> Vec> { let mut order_map: Vec<(usize, Vec<(F, MultiIndex)>)> = Vec::new(); for (c, mi) in &self.terms { @@ -785,6 +807,7 @@ impl DiffOp { /// # Panics /// /// Panics if the operator is not homogeneous (mixed total orders). + #[must_use] pub fn sparse_distribution(&self) -> SparseSamplingDistribution { let k = self.terms[0].1.total_order(); for (_, mi) in &self.terms { @@ -870,21 +893,25 @@ pub struct SparseJetEntryRef<'a, F> { impl<'a, F: Float> SparseJetEntryRef<'a, F> { /// Slot assignments: `(var_index, slot, 1/slot!)`. + #[must_use] pub fn input_coeffs(&self) -> &[(usize, usize, F)] { &self.entry.input_coeffs } /// Which output coefficient to read. + #[must_use] pub fn output_coeff_index(&self) -> usize { self.entry.output_coeff_index } /// Extraction prefactor from the Faà di Bruno formula. + #[must_use] pub fn extraction_prefactor(&self) -> F { self.entry.extraction_prefactor } /// Sign of the operator coefficient `C_α`. + #[must_use] pub fn sign(&self) -> F { self.entry.sign } diff --git a/src/gpu/cuda_backend.rs b/src/gpu/cuda_backend.rs index 2d2be88..14e3a37 100644 --- a/src/gpu/cuda_backend.rs +++ b/src/gpu/cuda_backend.rs @@ -71,6 +71,9 @@ macro_rules! cuda_forward_batch_body { builder.arg(&nv); builder.arg(&no); builder.arg(&$batch_size); + // SAFETY: All device buffers are correctly sized for `batch_size` elements, + // the kernel was compiled from our bundled source, and the launch config + // grid/block dimensions match the batch size. unsafe { builder.launch(cfg) }.map_err(cuda_err)?; s.synchronize().map_err(cuda_err)?; @@ -117,6 +120,9 @@ macro_rules! cuda_gradient_batch_body { builder.arg(&nv); builder.arg(&no); builder.arg(&$batch_size); + // SAFETY: All device buffers are correctly sized for `batch_size` elements, + // the kernel was compiled from our bundled source, and the launch config + // grid/block dimensions match the batch size. unsafe { builder.launch(cfg) }.map_err(cuda_err)?; // Reverse pass @@ -139,6 +145,9 @@ macro_rules! cuda_gradient_batch_body { builder.arg(&ni); builder.arg(&nv); builder.arg(&$batch_size); + // SAFETY: All device buffers are correctly sized for `batch_size` elements, + // the kernel was compiled from our bundled source, and the launch config + // grid/block dimensions match the batch size. unsafe { builder.launch(cfg) }.map_err(cuda_err)?; s.synchronize().map_err(cuda_err)?; @@ -207,6 +216,9 @@ macro_rules! cuda_hvp_batch_body { builder.arg(&ni); builder.arg(&nv); builder.arg(&$batch_size); + // SAFETY: All device buffers are correctly sized for `batch_size` elements, + // the kernel was compiled from our bundled source, and the launch config + // grid/block dimensions match the batch size. unsafe { builder.launch(cfg) }.map_err(cuda_err)?; s.synchronize().map_err(cuda_err)?; @@ -285,6 +297,9 @@ macro_rules! cuda_sparse_jacobian_body { builder.arg(&nv); builder.arg(&$tape.num_outputs); builder.arg(&batch); + // SAFETY: All device buffers are correctly sized for `batch` elements, + // the kernel was compiled from our bundled source, and the launch config + // grid/block dimensions match the batch size. unsafe { builder.launch(cfg) }.map_err(cuda_err)?; s.synchronize().map_err(cuda_err)?; @@ -397,6 +412,9 @@ macro_rules! cuda_taylor_fwd_2nd_body { builder.arg(&nv); builder.arg(&no); builder.arg(&$batch_size); + // SAFETY: All device buffers are correctly sized for `batch_size` elements, + // the kernel was compiled from our bundled source, and the launch config + // grid/block dimensions match the batch size. unsafe { builder.launch(cfg) }.map_err(cuda_err)?; s.synchronize().map_err(cuda_err)?; @@ -553,6 +571,8 @@ impl CudaContext { arg1: s.clone_htod(&arg1).unwrap(), constants_f64: s.clone_htod(&constants).unwrap(), output_indices: s.clone_htod(&output_indices).unwrap(), + // SAFETY(u32 cast): tape length cannot practically exceed u32::MAX (~4.3B opcodes + // ≈ 17 GB of opcode storage alone). num_ops: tape.opcodes_slice().len() as u32, num_inputs: tape.num_inputs() as u32, num_variables: tape.num_variables_count() as u32, diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs index 09acac8..e6a1249 100644 --- a/src/gpu/mod.rs +++ b/src/gpu/mod.rs @@ -211,36 +211,39 @@ pub struct GpuTapeData { } impl GpuTapeData { - /// Convert a `BytecodeTape` to GPU-uploadable format. - /// - /// Returns `Err(CustomOpsNotSupported)` if the tape contains custom ops, - /// since custom Rust closures cannot execute on GPU hardware. - pub fn from_tape(tape: &BytecodeTape) -> Result { - if tape.has_custom_ops() { - return Err(GpuError::CustomOpsNotSupported); - } - + /// Build `GpuTapeData` from a tape's structural data and pre-converted constants. + fn build_from_tape( + tape: &BytecodeTape, + constants: Vec, + ) -> Self { let opcodes_raw = tape.opcodes_slice(); let args = tape.arg_indices_slice(); - let vals = tape.values_slice(); let n = opcodes_raw.len(); - let opcodes: Vec = opcodes_raw.iter().map(|op| *op as u32).collect(); - let arg0: Vec = args.iter().map(|a| a[0]).collect(); - let arg1: Vec = args.iter().map(|a| a[1]).collect(); - let constants: Vec = vals.to_vec(); - - Ok(GpuTapeData { - opcodes, - arg0, - arg1, + GpuTapeData { + opcodes: opcodes_raw.iter().map(|op| *op as u32).collect(), + arg0: args.iter().map(|a| a[0]).collect(), + arg1: args.iter().map(|a| a[1]).collect(), constants, + // SAFETY(u32 cast): n is the number of tape opcodes. Exceeding u32::MAX (~4.3B) + // would require ~17 GB of opcode storage alone, which is impractical. num_ops: n as u32, num_inputs: tape.num_inputs() as u32, num_variables: tape.num_variables_count() as u32, output_index: tape.output_index() as u32, output_indices: tape.all_output_indices().to_vec(), - }) + } + } + + /// Convert a `BytecodeTape` to GPU-uploadable format. + /// + /// Returns `Err(CustomOpsNotSupported)` if the tape contains custom ops, + /// since custom Rust closures cannot execute on GPU hardware. + pub fn from_tape(tape: &BytecodeTape) -> Result { + if tape.has_custom_ops() { + return Err(GpuError::CustomOpsNotSupported); + } + Ok(Self::build_from_tape(tape, tape.values_slice().to_vec())) } /// Convert a `BytecodeTape` to GPU-uploadable f32 format. @@ -253,28 +256,8 @@ impl GpuTapeData { if tape.has_custom_ops() { return Err(GpuError::CustomOpsNotSupported); } - - let opcodes_raw = tape.opcodes_slice(); - let args = tape.arg_indices_slice(); - let vals = tape.values_slice(); - let n = opcodes_raw.len(); - - let opcodes: Vec = opcodes_raw.iter().map(|op| *op as u32).collect(); - let arg0: Vec = args.iter().map(|a| a[0]).collect(); - let arg1: Vec = args.iter().map(|a| a[1]).collect(); - let constants: Vec = vals.iter().map(|&v| v as f32).collect(); - - Ok(GpuTapeData { - opcodes, - arg0, - arg1, - constants, - num_ops: n as u32, - num_inputs: tape.num_inputs() as u32, - num_variables: tape.num_variables_count() as u32, - output_index: tape.output_index() as u32, - output_indices: tape.all_output_indices().to_vec(), - }) + let constants = tape.values_slice().iter().map(|&v| v as f32).collect(); + Ok(Self::build_from_tape(tape, constants)) } } diff --git a/src/nonsmooth.rs b/src/nonsmooth.rs index 40612a0..d98bb83 100644 --- a/src/nonsmooth.rs +++ b/src/nonsmooth.rs @@ -72,6 +72,7 @@ impl NonsmoothInfo { /// Branch signature: `(tape_index, branch)` pairs for all kinks. /// /// Two evaluations at the same input produce the same signature. + #[must_use] pub fn signature(&self) -> Vec<(u32, i8)> { self.kinks .iter() diff --git a/src/opcode.rs b/src/opcode.rs index aa338d4..95291cd 100644 --- a/src/opcode.rs +++ b/src/opcode.rs @@ -100,6 +100,7 @@ pub enum OpCode { /// Use [`has_nontrivial_subdifferential`] to distinguish the subset that /// contributes distinct limiting Jacobians for Clarke enumeration. #[inline] +#[must_use] pub fn is_nonsmooth(op: OpCode) -> bool { matches!( op, @@ -122,6 +123,7 @@ pub fn is_nonsmooth(op: OpCode) -> bool { /// identical partials, so enumerating them in Clarke Jacobian adds cost with no /// information. #[inline] +#[must_use] pub fn has_nontrivial_subdifferential(op: OpCode) -> bool { matches!(op, OpCode::Abs | OpCode::Min | OpCode::Max) } @@ -382,6 +384,7 @@ fn powi_exp_decode(b: T) -> i32 { /// Encode a `powi` exponent as a value that can be stored in `arg_indices[1]`. #[inline] +#[must_use] pub fn powi_exp_encode(exp: i32) -> u32 { exp as u32 } diff --git a/src/sparse.rs b/src/sparse.rs index ab027a2..cb8df5b 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -21,16 +21,19 @@ pub struct SparsityPattern { impl SparsityPattern { /// Number of non-zero entries in the pattern. + #[must_use] pub fn nnz(&self) -> usize { self.rows.len() } /// Whether the pattern is empty (all zeros). + #[must_use] pub fn is_empty(&self) -> bool { self.rows.is_empty() } /// Check if position (i, j) is in the pattern (checks both (i,j) and (j,i)). + #[must_use] pub fn contains(&self, i: usize, j: usize) -> bool { let (r, c) = if i >= j { (i, j) } else { (j, i) }; self.rows @@ -269,6 +272,7 @@ pub(crate) fn extract_bits(bitset: &[u64], max_bits: usize) -> Vec { /// /// Returns `(colors, num_colors)` where `colors[i]` is the color assigned to input `i`. /// Vertices are visited in decreasing-degree order for better results. +#[must_use] pub fn greedy_coloring(pattern: &SparsityPattern) -> (Vec, u32) { let n = pattern.dim; if n == 0 { @@ -309,64 +313,7 @@ pub fn greedy_coloring(pattern: &SparsityPattern) -> (Vec, u32) { adj2[v].dedup(); } - // Sort vertices by decreasing degree in G^2 - let mut order: Vec = (0..n).collect(); - order.sort_by(|&a, &b| adj2[b].len().cmp(&adj2[a].len())); - - let mut colors = vec![u32::MAX; n]; - let mut num_colors = 0u32; - - for &v in &order { - // Find smallest available color using u64 bitset (supports up to 64 colors). - // For virtually all practical sparse Hessians, 64 colors is sufficient. - // Falls back to linear scan if > 64 colors are needed. - let mut used_bits: u64 = 0; - let mut needs_fallback = false; - for &neighbor in &adj2[v] { - let c = colors[neighbor as usize]; - if c != u32::MAX { - if c < 64 { - used_bits |= 1u64 << c; - } else { - needs_fallback = true; - } - } - } - - let color = if !needs_fallback { - (!used_bits).trailing_zeros() - } else { - // Fallback: collect all used colors and find first gap - let mut used_vec: Vec = adj2[v] - .iter() - .filter_map(|&neighbor| { - let c = colors[neighbor as usize]; - if c != u32::MAX { - Some(c) - } else { - None - } - }) - .collect(); - used_vec.sort_unstable(); - used_vec.dedup(); - let mut c = 0u32; - for &u in &used_vec { - if u != c { - break; - } - c += 1; - } - c - }; - - colors[v] = color; - if color + 1 > num_colors { - num_colors = color + 1; - } - } - - (colors, num_colors) + greedy_distance1_coloring(&adj2, n) } /// Compressed Sparse Row (CSR) format for sparse matrices. @@ -382,6 +329,7 @@ pub struct CsrPattern { impl CsrPattern { /// Number of stored entries. + #[must_use] pub fn nnz(&self) -> usize { self.col_ind.len() } @@ -418,6 +366,7 @@ impl CsrPattern { impl SparsityPattern { /// Convert to CSR format preserving the lower triangle (matching COO storage). + #[must_use] pub fn to_csr_lower(&self) -> CsrPattern { let n = self.dim; let mut row_ptr = vec![0u32; n + 1]; @@ -453,6 +402,7 @@ impl SparsityPattern { /// /// For every off-diagonal entry (r, c) in the lower triangle, also stores /// the mirrored entry (c, r). Diagonal entries appear once. + #[must_use] pub fn to_csr(&self) -> CsrPattern { let n = self.dim; let mut row_ptr = vec![0u32; n + 1]; @@ -524,16 +474,19 @@ pub struct JacobianSparsityPattern { impl JacobianSparsityPattern { /// Number of structural non-zeros in the pattern. + #[must_use] pub fn nnz(&self) -> usize { self.rows.len() } /// Whether the pattern is empty (all zeros). + #[must_use] pub fn is_empty(&self) -> bool { self.rows.is_empty() } /// Check if position (output_idx, input_idx) is in the pattern. + #[must_use] pub fn contains(&self, output_idx: usize, input_idx: usize) -> bool { self.rows .iter() @@ -612,37 +565,14 @@ pub(crate) fn detect_jacobian_sparsity_impl( /// applies greedy distance-1 coloring with degree ordering. /// /// Returns `(colors, num_colors)` where `colors[j]` is the color of column j. +#[must_use] pub fn column_coloring(pattern: &JacobianSparsityPattern) -> (Vec, u32) { - let n = pattern.num_inputs; - if n == 0 { - return (Vec::new(), 0); - } - - // Build row -> columns map - let mut row_to_cols: Vec> = vec![Vec::new(); pattern.num_outputs]; - for (&r, &c) in pattern.rows.iter().zip(pattern.cols.iter()) { - row_to_cols[r as usize].push(c); - } - - // Build column adjacency: columns sharing a row are adjacent - let mut adj: Vec> = vec![Vec::new(); n]; - for cols_in_row in &row_to_cols { - for i in 0..cols_in_row.len() { - for j in (i + 1)..cols_in_row.len() { - let c1 = cols_in_row[i] as usize; - let c2 = cols_in_row[j] as usize; - adj[c1].push(c2 as u32); - adj[c2].push(c1 as u32); - } - } - } - // Deduplicate adjacency lists - for list in &mut adj { - list.sort_unstable(); - list.dedup(); - } - - greedy_distance1_coloring(&adj, n) + intersection_graph_coloring( + &pattern.rows, + &pattern.cols, + pattern.num_outputs, + pattern.num_inputs, + ) } /// Row coloring for reverse-mode Jacobian compression. @@ -652,37 +582,53 @@ pub fn column_coloring(pattern: &JacobianSparsityPattern) -> (Vec, u32) { /// greedy distance-1 coloring with degree ordering. /// /// Returns `(colors, num_colors)` where `colors[i]` is the color of row i. +#[must_use] pub fn row_coloring(pattern: &JacobianSparsityPattern) -> (Vec, u32) { - let n = pattern.num_outputs; - if n == 0 { + intersection_graph_coloring( + &pattern.cols, + &pattern.rows, + pattern.num_inputs, + pattern.num_outputs, + ) +} + +/// Build an intersection graph and color it greedily. +/// +/// Groups entries by `group_keys` (dimension `group_dim`), then colors `color_keys` +/// (dimension `color_dim`). Two color-keys that appear in the same group need +/// different colors. +fn intersection_graph_coloring( + group_keys: &[u32], + color_keys: &[u32], + group_dim: usize, + color_dim: usize, +) -> (Vec, u32) { + if color_dim == 0 { return (Vec::new(), 0); } - // Build column -> rows map - let mut col_to_rows: Vec> = vec![Vec::new(); pattern.num_inputs]; - for (&r, &c) in pattern.rows.iter().zip(pattern.cols.iter()) { - col_to_rows[c as usize].push(r); + let mut groups: Vec> = vec![Vec::new(); group_dim]; + for (&g, &c) in group_keys.iter().zip(color_keys.iter()) { + groups[g as usize].push(c); } - // Build row adjacency: rows sharing a column are adjacent - let mut adj: Vec> = vec![Vec::new(); n]; - for rows_in_col in &col_to_rows { - for i in 0..rows_in_col.len() { - for j in (i + 1)..rows_in_col.len() { - let r1 = rows_in_col[i] as usize; - let r2 = rows_in_col[j] as usize; - adj[r1].push(r2 as u32); - adj[r2].push(r1 as u32); + let mut adj: Vec> = vec![Vec::new(); color_dim]; + for members in &groups { + for i in 0..members.len() { + for j in (i + 1)..members.len() { + let a = members[i] as usize; + let b = members[j] as usize; + adj[a].push(b as u32); + adj[b].push(a as u32); } } } - // Deduplicate adjacency lists for list in &mut adj { list.sort_unstable(); list.dedup(); } - greedy_distance1_coloring(&adj, n) + greedy_distance1_coloring(&adj, color_dim) } /// Greedy distance-1 coloring with degree ordering and u64 bitset fast path. diff --git a/src/stde.rs b/src/stde.rs deleted file mode 100644 index 5123904..0000000 --- a/src/stde.rs +++ /dev/null @@ -1,1409 +0,0 @@ -//! Stochastic Taylor Derivative Estimators (STDE). -//! -//! Estimate differential operators (Laplacian, Hessian diagonal, directional -//! derivatives) by pushing random direction vectors through Taylor-mode AD. -//! -//! # How it works -//! -//! For f: R^n -> R at point x, define g(t) = f(x + t*v) where v is a -//! direction vector. The Taylor coefficients of g at t=0 are: -//! -//! - c0 = f(x) -//! - c1 = nabla f(x) . v (directional first derivative) -//! - c2 = v^T H_f(x) v / 2 (half the directional second derivative) -//! -//! By choosing v appropriately (Rademacher, Gaussian, coordinate basis), -//! we can estimate operators like the Laplacian in O(S*K*L) time instead -//! of O(n^2*L) for the full Hessian. -//! -//! # Variance properties -//! -//! The Hutchinson estimator `(1/S) sum_s v_s^T H v_s` is unbiased when -//! E\[vv^T\] = I. Its variance depends on the distribution of v: -//! -//! - **Rademacher** (entries ±1): Var = `2 sum_{i≠j} H_ij^2`. The diagonal -//! contributes zero variance since `v_i^2 = 1` always. -//! - **Gaussian** (v ~ N(0,I)): Var = `2 ||H||_F^2`. Higher variance than -//! Rademacher because `v_i^2 ~ chi-squared(1)` introduces diagonal noise. -//! -//! The [`laplacian_with_control`] function reduces Gaussian variance to match -//! Rademacher by subtracting the exact diagonal contribution (a control -//! variate). This requires the diagonal from [`hessian_diagonal`] (n extra -//! evaluations). For Rademacher directions, the control variate has no effect -//! since the diagonal variance is already zero. -//! -//! **Antithetic sampling** (pairing +v with -v) does **not** reduce variance -//! for trace estimation. The Hutchinson sample `v^T H v` is quadratic (even) -//! in v, so `(-v)^T H (-v) = v^T H v` — antithetic pairs are identical. -//! -//! # Design -//! -//! - **No `rand` dependency**: all functions accept user-provided direction -//! vectors. The library stays pure; users bring their own RNG. -//! - **`Taylor`** for second-order operators: stack-allocated, Copy, -//! monomorphized. The order K=3 is statically known. -//! - **`TaylorDyn`** variants for runtime-determined order. -//! - **Panics on misuse**: dimension mismatches panic, following existing -//! API conventions (`record`, `grad`, `hvp`). -//! -//! # Const-Generic Higher-Order Diagonal -//! -//! [`diagonal_kth_order_const`] is a stack-allocated variant of -//! [`diagonal_kth_order`] for compile-time-known derivative order. It uses -//! `Taylor` directly (no `TaylorDyn` arena), which is faster when -//! the order is statically known. `ORDER = k + 1` where k is the derivative -//! order. For f32, practical limit is `ORDER ≤ 14` (k ≤ 13) since `k! > 2^23` -//! causes precision loss. -//! -//! # Higher-Order Estimation -//! -//! [`diagonal_kth_order`] generalises [`hessian_diagonal`] from k=2 to -//! arbitrary k, computing exact `[∂^k u/∂x_j^k]` for all coordinates via -//! `TaylorDyn` jets of order k+1. The stochastic variant -//! [`diagonal_kth_order_stochastic`] subsamples coordinates for an unbiased -//! estimate of `Σ_j ∂^k u/∂x_j^k`. -//! -//! # Dense STDE for Positive-Definite Operators -//! -//! [`dense_stde_2nd`] estimates `tr(C · H_u(x)) = Σ_{ij} C_{ij} ∂²u/∂x_i∂x_j` -//! for a positive-definite coefficient matrix C. The caller provides a Cholesky -//! factor L (such that `C = L L^T`) and standard Gaussian vectors z. Internally, -//! `v = L · z` is computed, then pushed through a second-order Taylor jet. When -//! `L = I`, this reduces to the Hutchinson Laplacian estimator. -//! -//! # Parabolic PDE σ-Transform -//! -//! [`parabolic_diffusion`] computes `½ tr(σσ^T · Hess u)` for parabolic PDEs -//! (Fokker-Planck, Black-Scholes, HJB) by pushing columns of σ through -//! second-order Taylor jets. This avoids forming the off-diagonal Hessian -//! entries that a naïve `tr(A·H)` computation would require. -//! -//! # Sparse STDE (requires `stde` + `diffop`) -//! -//! `stde_sparse` estimates arbitrary differential operators `Lu(x)` by -//! sampling sparse k-jets from a `SparseSamplingDistribution` built via -//! `DiffOp::sparse_distribution`. Each sample is a single forward -//! pushforward; the per-sample estimator is `sign(C_α) · Z · D^α u` where -//! Z = Σ|C_α| is the normalization constant. This implements the core -//! contribution of Shi et al. (NeurIPS 2024). - -use crate::bytecode_tape::BytecodeTape; -use crate::dual::Dual; -use crate::taylor::Taylor; -use crate::taylor_dyn::{TaylorArenaLocal, TaylorDyn, TaylorDynGuard}; -use crate::Float; - -// ══════════════════════════════════════════════ -// Result types -// ══════════════════════════════════════════════ - -/// Result of a stochastic estimation with sample statistics. -/// -/// Contains the function value, the estimated operator value, and -/// sample statistics (variance, standard error) that quantify -/// estimator quality. -#[derive(Clone, Debug)] -pub struct EstimatorResult { - /// Function value f(x). - pub value: F, - /// Estimated operator value (e.g. Laplacian). - pub estimate: F, - /// Sample variance of the per-direction estimates. - /// Zero when `num_samples == 1` (undefined, clamped to zero). - pub sample_variance: F, - /// Standard error of the mean: `sqrt(sample_variance / num_samples)`. - pub standard_error: F, - /// Number of direction samples used. - pub num_samples: usize, -} - -// ══════════════════════════════════════════════ -// Welford online accumulator -// ══════════════════════════════════════════════ - -/// Welford's online algorithm for incremental mean and variance. -struct WelfordAccumulator { - mean: F, - m2: F, - count: usize, -} - -impl WelfordAccumulator { - fn new() -> Self { - Self { - mean: F::zero(), - m2: F::zero(), - count: 0, - } - } - - fn update(&mut self, sample: F) { - self.count += 1; - let k1 = F::from(self.count).unwrap(); - let delta = sample - self.mean; - self.mean = self.mean + delta / k1; - let delta2 = sample - self.mean; - self.m2 = self.m2 + delta * delta2; - } - - fn finalize(&self) -> (F, F, F) { - let nf = F::from(self.count).unwrap(); - if self.count > 1 { - let var = self.m2 / (nf - F::one()); - (self.mean, var, (var / nf).sqrt()) - } else { - (self.mean, F::zero(), F::zero()) - } - } -} - -// ══════════════════════════════════════════════ -// Estimator trait + built-in estimators -// ══════════════════════════════════════════════ - -/// Trait for stochastic estimators that combine Taylor jet coefficients into -/// a per-direction sample. -/// -/// Given the Taylor coefficients `(c0, c1, c2)` from propagating a random -/// direction through a tape, an `Estimator` produces a scalar sample whose -/// expectation (over random directions with `E[vv^T] = I`) equals the -/// desired quantity. -pub trait Estimator { - /// Compute one sample from the Taylor jet coefficients. - /// - /// - `c0` = f(x) - /// - `c1` = ∇f(x)·v (directional first derivative) - /// - `c2` = v^T H v / 2 (half directional second derivative) - fn sample(&self, c0: F, c1: F, c2: F) -> F; -} - -/// Hutchinson trace estimator: estimates tr(H) = Laplacian. -/// -/// Each sample is `2 * c2 = v^T H v`. Since `E[v^T H v] = tr(H)` when -/// `E[vv^T] = I`, the mean of these samples converges to the Laplacian. -pub struct Laplacian; - -impl Estimator for Laplacian { - #[inline] - fn sample(&self, _c0: F, _c1: F, c2: F) -> F { - F::from(2.0).unwrap() * c2 - } -} - -/// Estimates `||∇f||²` (squared gradient norm). -/// -/// Each sample is `c1² = (∇f·v)²`. Since `E[(∇f·v)²] = ∇f^T E[vv^T] ∇f = ||∇f||²` -/// when `E[vv^T] = I`, the mean converges to the squared gradient norm. -/// -/// Useful for score matching loss functions where `||∇ log p||²` appears. -pub struct GradientSquaredNorm; - -impl Estimator for GradientSquaredNorm { - #[inline] - fn sample(&self, _c0: F, c1: F, _c2: F) -> F { - c1 * c1 - } -} - -// ══════════════════════════════════════════════ -// Generic estimation pipeline -// ══════════════════════════════════════════════ - -/// Result of a divergence estimation. -/// -/// Separate from [`EstimatorResult`] because the function output is a vector -/// (`values: Vec`) rather than a scalar (`value: F`). -#[derive(Clone, Debug)] -pub struct DivergenceResult { - /// Function output vector f(x). - pub values: Vec, - /// Estimated divergence (trace of the Jacobian). - pub estimate: F, - /// Sample variance of per-direction estimates. - pub sample_variance: F, - /// Standard error of the mean. - pub standard_error: F, - /// Number of direction samples used. - pub num_samples: usize, -} - -/// Estimate a quantity using the given [`Estimator`] and Welford's online algorithm. -/// -/// Evaluates the tape at `x` for each direction, computes the estimator's sample -/// from the Taylor jet, and aggregates with running mean and variance. -/// -/// # Panics -/// -/// Panics if `directions` is empty or any direction's length does not match -/// `tape.num_inputs()`. -pub fn estimate( - estimator: &impl Estimator, - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], -) -> EstimatorResult { - assert!(!directions.is_empty(), "directions must not be empty"); - - let mut buf = Vec::new(); - let mut value = F::zero(); - let mut acc = WelfordAccumulator::new(); - - for v in directions.iter() { - let (c0, c1, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); - value = c0; - acc.update(estimator.sample(c0, c1, c2)); - } - - let (estimate, sample_variance, standard_error) = acc.finalize(); - - EstimatorResult { - value, - estimate, - sample_variance, - standard_error, - num_samples: directions.len(), - } -} - -/// Estimate a quantity using importance-weighted samples (West's 1979 algorithm). -/// -/// Each direction `directions[s]` has an associated weight `weights[s]`. -/// The weighted mean is `Σ(w_s * sample_s) / Σ(w_s)` and the variance uses -/// the reliability-weight Bessel correction: `M2 / (W - W2/W)` where -/// `W = Σw_s` and `W2 = Σw_s²`. -/// -/// # Panics -/// -/// Panics if `directions` is empty, `weights.len() != directions.len()`, -/// or any direction's length does not match `tape.num_inputs()`. -pub fn estimate_weighted( - estimator: &impl Estimator, - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], - weights: &[F], -) -> EstimatorResult { - assert!(!directions.is_empty(), "directions must not be empty"); - assert_eq!( - weights.len(), - directions.len(), - "weights.len() must match directions.len()" - ); - - let mut buf = Vec::new(); - let mut value = F::zero(); - - // West's (1979) weighted online algorithm - let mut w_sum = F::zero(); - let mut w_sum2 = F::zero(); - let mut mean = F::zero(); - let mut m2 = F::zero(); - - for (k, v) in directions.iter().enumerate() { - let (c0, c1, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); - value = c0; - let s = estimator.sample(c0, c1, c2); - let w = weights[k]; - - w_sum = w_sum + w; - w_sum2 = w_sum2 + w * w; - let delta = s - mean; - mean = mean + (w / w_sum) * delta; - let delta2 = s - mean; - m2 = m2 + w * delta * delta2; - } - - let n = directions.len(); - let denom = w_sum - w_sum2 / w_sum; - let (sample_variance, standard_error) = if n > 1 && denom > F::zero() { - let var = m2 / denom; - let nf = F::from(n).unwrap(); - (var, (var / nf).sqrt()) - } else { - (F::zero(), F::zero()) - }; - - EstimatorResult { - value, - estimate: mean, - sample_variance, - standard_error, - num_samples: n, - } -} - -// ══════════════════════════════════════════════ -// Low-level: single-direction jet propagation -// ══════════════════════════════════════════════ - -/// Propagate direction `v` through tape using second-order Taylor mode. -/// -/// Constructs `Taylor` inputs where `input[i] = [x[i], v[i], 0]`, -/// runs `forward_tangent`, and extracts the output coefficients. -/// -/// Returns `(f(x), nabla_f . v, v^T H v / 2)`. -/// -/// # Panics -/// -/// Panics if `x.len()` or `v.len()` does not match `tape.num_inputs()`. -pub fn taylor_jet_2nd(tape: &BytecodeTape, x: &[F], v: &[F]) -> (F, F, F) { - let mut buf = Vec::new(); - taylor_jet_2nd_with_buf(tape, x, v, &mut buf) -} - -/// Like [`taylor_jet_2nd`] but reuses a caller-provided buffer to avoid -/// reallocation across multiple calls. -/// -/// # Panics -/// -/// Panics if `x.len()` or `v.len()` does not match `tape.num_inputs()`. -pub fn taylor_jet_2nd_with_buf( - tape: &BytecodeTape, - x: &[F], - v: &[F], - buf: &mut Vec>, -) -> (F, F, F) { - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - assert_eq!(v.len(), n, "v.len() must match tape.num_inputs()"); - - let inputs: Vec> = x - .iter() - .zip(v.iter()) - .map(|(&xi, &vi)| Taylor::new([xi, vi, F::zero()])) - .collect(); - - tape.forward_tangent(&inputs, buf); - - let out = buf[tape.output_index()]; - (out.coeffs[0], out.coeffs[1], out.coeffs[2]) -} - -// ══════════════════════════════════════════════ -// Mid-level: batch direction evaluation -// ══════════════════════════════════════════════ - -/// Evaluate multiple directions through the tape. -/// -/// Returns `(value, first_order, second_order)` where: -/// - `value` = f(x) -/// - `first_order[s]` = nabla_f . v_s (directional first derivative) -/// - `second_order[s]` = v_s^T H v_s / 2 (half directional second derivative) -/// -/// # Panics -/// -/// Panics if any direction's length does not match `tape.num_inputs()`. -pub fn directional_derivatives( - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], -) -> (F, Vec, Vec) { - let mut buf = Vec::new(); - let mut first_order = Vec::with_capacity(directions.len()); - let mut second_order = Vec::with_capacity(directions.len()); - let mut value = F::zero(); - - for v in directions { - let (c0, c1, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); - value = c0; - first_order.push(c1); - second_order.push(c2); - } - - (value, first_order, second_order) -} - -// ══════════════════════════════════════════════ -// High-level: operator estimators -// ══════════════════════════════════════════════ - -/// Estimate the Laplacian (trace of Hessian) via Hutchinson's trace estimator. -/// -/// Directions must satisfy E[vv^T] = I (e.g. Rademacher vectors with entries -/// +/-1, or standard Gaussian vectors). The estimator is: -/// -/// Laplacian ~ (1/S) * sum_s 2*c2_s -/// -/// where c2_s is the second Taylor coefficient for direction s. -/// -/// Returns `(value, laplacian_estimate)`. -/// -/// Note: coordinate basis vectors do **not** satisfy E[vv^T] = I and will -/// give tr(H)/n instead of tr(H). Use [`hessian_diagonal`] and sum for exact -/// computation via coordinate directions. -/// -/// # Panics -/// -/// Panics if `directions` is empty or any direction's length does not match -/// `tape.num_inputs()`. -pub fn laplacian(tape: &BytecodeTape, x: &[F], directions: &[&[F]]) -> (F, F) { - assert!(!directions.is_empty(), "directions must not be empty"); - - let (value, _, second_order) = directional_derivatives(tape, x, directions); - - let two = F::from(2.0).unwrap(); - let s = F::from(directions.len()).unwrap(); - let sum: F = second_order - .iter() - .fold(F::zero(), |acc, &c2| acc + two * c2); - let laplacian = sum / s; - - (value, laplacian) -} - -/// Estimate the Laplacian with sample statistics via Hutchinson's trace estimator. -/// -/// Same estimator as [`laplacian`], but additionally computes sample variance -/// and standard error using Welford's online algorithm (numerically stable, -/// single pass). -/// -/// Each direction produces a sample `2 * c2_s`. The returned statistics -/// describe the distribution of these samples. -/// -/// # Panics -/// -/// Panics if `directions` is empty or any direction's length does not match -/// `tape.num_inputs()`. -pub fn laplacian_with_stats( - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], -) -> EstimatorResult { - estimate(&Laplacian, tape, x, directions) -} - -/// Estimate the Laplacian with a diagonal control variate. -/// -/// Uses the exact Hessian diagonal (from [`hessian_diagonal`]) as a control -/// variate to reduce estimator variance. Each raw Hutchinson sample -/// `raw_s = v^T H v = 2 * c2_s` is adjusted: -/// -/// ```text -/// adjusted_s = raw_s - sum_j(D_jj * v_j^2) + tr(D) -/// ``` -/// -/// where D is the diagonal of H. The adjustment subtracts the noisy diagonal -/// contribution and adds back its exact expectation `tr(D)`. -/// -/// **Effect by distribution**: -/// - **Gaussian**: reduces variance from `2||H||_F^2` to `2 sum_{i≠j} H_ij^2` -/// (matching Rademacher performance). -/// - **Rademacher**: no effect, since `v_j^2 = 1` always, so the adjustment -/// is `sum_j D_jj * 1 - tr(D) = 0`. -/// -/// Returns an [`EstimatorResult`] with statistics computed over the adjusted -/// samples. -/// -/// # Panics -/// -/// Panics if `directions` is empty, if any direction's length does not match -/// `tape.num_inputs()`, or if `control_diagonal.len() != tape.num_inputs()`. -pub fn laplacian_with_control( - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], - control_diagonal: &[F], -) -> EstimatorResult { - assert!(!directions.is_empty(), "directions must not be empty"); - let n = tape.num_inputs(); - assert_eq!( - control_diagonal.len(), - n, - "control_diagonal.len() must match tape.num_inputs()" - ); - - let two = F::from(2.0).unwrap(); - let trace_control: F = control_diagonal - .iter() - .copied() - .fold(F::zero(), |a, b| a + b); - - let mut buf = Vec::new(); - let mut value = F::zero(); - let mut acc = WelfordAccumulator::new(); - - for v in directions.iter() { - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); - value = c0; - - let raw = two * c2; - - // Control variate: subtract v^T D v, add back E[v^T D v] = tr(D) - let cv: F = control_diagonal - .iter() - .zip(v.iter()) - .fold(F::zero(), |acc, (&d, &vi)| acc + d * vi * vi); - acc.update(raw - cv + trace_control); - } - - let (estimate, sample_variance, standard_error) = acc.finalize(); - - EstimatorResult { - value, - estimate, - sample_variance, - standard_error, - num_samples: directions.len(), - } -} - -/// Exact Hessian diagonal via n coordinate-direction evaluations. -/// -/// For each coordinate j, pushes basis vector e_j through the tape and -/// reads `2 * c2`, which equals `d^2 f / dx_j^2`. -/// -/// Returns `(value, diag)` where `diag[j] = d^2 f / dx_j^2`. -pub fn hessian_diagonal(tape: &BytecodeTape, x: &[F]) -> (F, Vec) { - let mut buf = Vec::new(); - hessian_diagonal_with_buf(tape, x, &mut buf) -} - -/// Like [`hessian_diagonal`] but reuses a caller-provided buffer. -pub fn hessian_diagonal_with_buf( - tape: &BytecodeTape, - x: &[F], - buf: &mut Vec>, -) -> (F, Vec) { - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let two = F::from(2.0).unwrap(); - let mut diag = Vec::with_capacity(n); - let mut value = F::zero(); - - // Build basis vector once, mutate the hot coordinate - let mut e = vec![F::zero(); n]; - for j in 0..n { - e[j] = F::one(); - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, &e, buf); - value = c0; - diag.push(two * c2); - e[j] = F::zero(); - } - - (value, diag) -} - -// ══════════════════════════════════════════════ -// Higher-order diagonal estimation -// ══════════════════════════════════════════════ - -/// Exact k-th order diagonal: `[∂^k u/∂x_j^k for j in 0..n]`. -/// -/// Pushes n basis vectors through order-(k+1) `TaylorDyn` jets. For each -/// coordinate j, the input jet has `coeffs_j = [x_j, 1, 0, ..., 0]` and all -/// other inputs `coeffs_i = [x_i, 0, ..., 0]`. The output coefficient at -/// index k stores `∂^k u/∂x_j^k / k!`, so the derivative is `k! * coeffs[k]`. -/// -/// Requires `F: Float + TaylorArenaLocal` because the jet order is -/// runtime-determined (unlike [`hessian_diagonal`] which uses const-generic -/// `Taylor`). -/// -/// # Panics -/// -/// Panics if `k < 2`, `k > 20` (factorial overflow guard for f64), or if -/// `x.len()` does not match `tape.num_inputs()`. -pub fn diagonal_kth_order( - tape: &BytecodeTape, - x: &[F], - k: usize, -) -> (F, Vec) { - let mut buf = Vec::new(); - diagonal_kth_order_with_buf(tape, x, k, &mut buf) -} - -/// Like [`diagonal_kth_order`] but reuses a caller-provided buffer. -pub fn diagonal_kth_order_with_buf( - tape: &BytecodeTape, - x: &[F], - k: usize, - buf: &mut Vec>, -) -> (F, Vec) { - assert!(k >= 2, "k must be >= 2 (use gradient for k=1)"); - assert!(k <= 20, "k must be <= 20 (k! overflows f64 for k > 20)"); - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let order = k + 1; // number of Taylor coefficients - let _guard = TaylorDynGuard::::new(order); - - let mut k_factorial = F::one(); - for i in 2..=k { - k_factorial = k_factorial * F::from(i).unwrap(); - } - - let mut diag = Vec::with_capacity(n); - let mut value = F::zero(); - - for j in 0..n { - // Build TaylorDyn inputs: coeffs_j = [x_j, 1, 0, ..., 0], others = [x_i, 0, ..., 0] - let inputs: Vec> = (0..n) - .map(|i| { - let mut coeffs = vec![F::zero(); order]; - coeffs[0] = x[i]; - if i == j { - coeffs[1] = F::one(); - } - TaylorDyn::from_coeffs(&coeffs) - }) - .collect(); - - tape.forward_tangent(&inputs, buf); - - let out_coeffs = buf[tape.output_index()].coeffs(); - value = out_coeffs[0]; - diag.push(k_factorial * out_coeffs[k]); - } - - (value, diag) -} - -/// Exact k-th order diagonal using const-generic `Taylor`. -/// -/// `ORDER = k + 1` where `k = ORDER - 1` is the derivative order. Uses -/// stack-allocated Taylor coefficients — faster than [`diagonal_kth_order`] -/// when k is known at compile time. -/// -/// # Precision Note -/// -/// For f32, `k! > 2^23` when k ≥ 13 (ORDER ≥ 14), causing precision loss. -/// The existing [`diagonal_kth_order`] has a runtime guard `k ≤ 20` for f64; -/// the const-generic version has no such guard but users should be aware -/// that for f32, ORDER ≤ 14 is the practical limit. -/// -/// # Panics -/// -/// Compile-time error if `ORDER < 3` (i.e., k < 2). -/// Runtime panic if `x.len()` does not match `tape.num_inputs()`. -pub fn diagonal_kth_order_const( - tape: &BytecodeTape, - x: &[F], -) -> (F, Vec) { - let mut buf: Vec> = Vec::new(); - diagonal_kth_order_const_with_buf(tape, x, &mut buf) -} - -/// Like [`diagonal_kth_order_const`] but reuses a caller-provided buffer. -pub fn diagonal_kth_order_const_with_buf( - tape: &BytecodeTape, - x: &[F], - buf: &mut Vec>, -) -> (F, Vec) { - const { assert!(ORDER >= 3, "ORDER must be >= 3 (k=ORDER-1 >= 2)") } - - let k = ORDER - 1; - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let mut k_factorial = F::one(); - for i in 2..=k { - k_factorial = k_factorial * F::from(i).unwrap(); - } - - let mut diag = Vec::with_capacity(n); - let mut value = F::zero(); - - for j in 0..n { - let inputs: Vec> = (0..n) - .map(|i| { - let mut coeffs = [F::zero(); ORDER]; - coeffs[0] = x[i]; - if i == j { - coeffs[1] = F::one(); - } - Taylor::new(coeffs) - }) - .collect(); - - tape.forward_tangent(&inputs, buf); - - let out = buf[tape.output_index()]; - value = out.coeffs[0]; - diag.push(k_factorial * out.coeffs[k]); - } - - (value, diag) -} - -/// Stochastic k-th order diagonal estimate: `Σ_j ∂^k u/∂x_j^k`. -/// -/// Evaluates only the coordinate indices in `sampled_indices`, then scales -/// by `n / |J|` to produce an unbiased estimate of the full sum. -/// -/// # Panics -/// -/// Panics if `sampled_indices` is empty, `k < 2`, `k > 20`, or if -/// `x.len()` does not match `tape.num_inputs()`. -pub fn diagonal_kth_order_stochastic( - tape: &BytecodeTape, - x: &[F], - k: usize, - sampled_indices: &[usize], -) -> EstimatorResult { - assert!( - !sampled_indices.is_empty(), - "sampled_indices must not be empty" - ); - assert!(k >= 2, "k must be >= 2 (use gradient for k=1)"); - assert!(k <= 20, "k must be <= 20 (k! overflows f64 for k > 20)"); - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let order = k + 1; - let _guard = TaylorDynGuard::::new(order); - - let mut k_factorial = F::one(); - for i in 2..=k { - k_factorial = k_factorial * F::from(i).unwrap(); - } - - let nf = F::from(n).unwrap(); - - let mut value = F::zero(); - let mut acc = WelfordAccumulator::new(); - let mut buf: Vec> = Vec::new(); - - for &j in sampled_indices { - assert!(j < n, "sampled index {} out of bounds (n={})", j, n); - - let inputs: Vec> = (0..n) - .map(|i| { - let mut coeffs = vec![F::zero(); order]; - coeffs[0] = x[i]; - if i == j { - coeffs[1] = F::one(); - } - TaylorDyn::from_coeffs(&coeffs) - }) - .collect(); - - tape.forward_tangent(&inputs, &mut buf); - - let out_coeffs = buf[tape.output_index()].coeffs(); - value = out_coeffs[0]; - // Per-sample: k! * coeffs[k] (the diagonal entry for coordinate j) - acc.update(k_factorial * out_coeffs[k]); - } - - let (mean, sample_variance, standard_error) = acc.finalize(); - - // Unbiased estimator for Σ_j d_j: n * mean(sampled d_j's) - EstimatorResult { - value, - estimate: mean * nf, - sample_variance, - standard_error, - num_samples: sampled_indices.len(), - } -} - -// ══════════════════════════════════════════════ -// TaylorDyn variants (runtime order) -// ══════════════════════════════════════════════ - -/// Propagate direction `v` through tape using `TaylorDyn` with the given order. -/// -/// Creates a `TaylorDynGuard` internally, builds `TaylorDyn` inputs from -/// `(x, v)` with coefficients `[x_i, v_i, 0, ..., 0]`, runs `forward_tangent`, -/// and returns the full coefficient vector of the output. -/// -/// # Panics -/// -/// Panics if `x.len()` or `v.len()` does not match `tape.num_inputs()`, -/// or if `order < 2`. -pub fn taylor_jet_dyn( - tape: &BytecodeTape, - x: &[F], - v: &[F], - order: usize, -) -> Vec { - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - assert_eq!(v.len(), n, "v.len() must match tape.num_inputs()"); - assert!(order >= 2, "order must be >= 2"); - - let _guard = TaylorDynGuard::::new(order); - - let inputs: Vec> = x - .iter() - .zip(v.iter()) - .map(|(&xi, &vi)| { - let mut coeffs = vec![F::zero(); order]; - coeffs[0] = xi; - coeffs[1] = vi; - TaylorDyn::from_coeffs(&coeffs) - }) - .collect(); - - let mut buf = Vec::new(); - tape.forward_tangent(&inputs, &mut buf); - - buf[tape.output_index()].coeffs() -} - -/// Estimate the Laplacian via `TaylorDyn` (runtime-determined order). -/// -/// Uses order 3 (coefficients c0, c1, c2) internally. Manages its own -/// arena guard. -/// -/// Returns `(value, laplacian_estimate)`. -/// -/// # Panics -/// -/// Panics if `directions` is empty or any direction's length does not match -/// `tape.num_inputs()`. -pub fn laplacian_dyn( - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], -) -> (F, F) { - assert!(!directions.is_empty(), "directions must not be empty"); - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let _guard = TaylorDynGuard::::new(3); - - let two = F::from(2.0).unwrap(); - let s = F::from(directions.len()).unwrap(); - let mut sum = F::zero(); - let mut value = F::zero(); - let mut buf: Vec> = Vec::new(); - - for v in directions { - assert_eq!(v.len(), n, "direction length must match tape.num_inputs()"); - - let inputs: Vec> = x - .iter() - .zip(v.iter()) - .map(|(&xi, &vi)| TaylorDyn::from_coeffs(&[xi, vi, F::zero()])) - .collect(); - - tape.forward_tangent(&inputs, &mut buf); - - let out = buf[tape.output_index()]; - let coeffs = out.coeffs(); - value = coeffs[0]; - let c2 = coeffs[2]; - sum = sum + two * c2; - } - - (value, sum / s) -} - -// ══════════════════════════════════════════════ -// Hutch++ trace estimator -// ══════════════════════════════════════════════ - -/// Modified Gram-Schmidt orthonormalisation, in-place. -/// -/// Orthonormalises the columns of the matrix represented as `columns: &mut Vec>`. -/// Drops near-zero columns (norm < epsilon). Returns the rank (number of retained columns). -fn modified_gram_schmidt(columns: &mut Vec>, epsilon: F) -> usize { - let mut rank = 0; - let mut i = 0; - while i < columns.len() { - // Orthogonalise against all previously accepted columns - for j in 0..rank { - // Split to satisfy the borrow checker: j < rank <= i - let (left, right) = columns.split_at_mut(i); - let qj = &left[j]; - let ci = &mut right[0]; - let dot: F = qj - .iter() - .zip(ci.iter()) - .fold(F::zero(), |acc, (&a, &b)| acc + a * b); - for (c, &q) in ci.iter_mut().zip(qj.iter()) { - *c = *c - dot * q; - } - } - - // Compute norm - let norm_sq: F = columns[i].iter().fold(F::zero(), |acc, &v| acc + v * v); - let norm = norm_sq.sqrt(); - - if norm < epsilon { - // Drop this column (near-zero after projection) - columns.swap_remove(i); - // Don't increment i — swapped element needs processing - } else { - // Normalise - let inv_norm = F::one() / norm; - for v in columns[i].iter_mut() { - *v = *v * inv_norm; - } - // Move to rank position - if i != rank { - columns.swap(i, rank); - } - rank += 1; - i += 1; - } - } - columns.truncate(rank); - rank -} - -/// Hutch++ trace estimator (Meyer et al. 2021) for the Laplacian. -/// -/// Achieves O(1/S²) convergence for matrices with decaying eigenvalues by -/// splitting the work into: -/// -/// 1. **Sketch phase**: k HVPs (via `tape.hvp_with_buf`) produce columns of H·S, -/// which are orthonormalised via Modified Gram-Schmidt to give a basis Q. -/// 2. **Exact subspace trace**: For each q_i in Q, `taylor_jet_2nd` gives -/// q_i^T H q_i. Sum = tr(Q^T H Q) — this part has zero variance. -/// 3. **Residual Hutchinson**: For each stochastic direction g_s, project out Q -/// (g' = g - Q(Q^T g)) and estimate the residual trace via `taylor_jet_2nd`. -/// 4. **Total** = exact_trace + residual_mean. -/// -/// The variance and standard error in the result refer to the residual only. -/// -/// # Arguments -/// -/// - `sketch_directions`: k directions for the sketch phase. Typically Rademacher -/// or Gaussian. More directions capture more of the spectrum exactly. -/// - `stochastic_directions`: S directions for residual estimation. Rademacher recommended. -/// -/// # Cost -/// -/// k HVPs (≈2k forward passes) + k Taylor jets (exact subspace) + S Taylor jets -/// (residual) + O(k²·n) for Gram-Schmidt. -/// -/// # Panics -/// -/// Panics if `sketch_directions` or `stochastic_directions` is empty, or if any -/// direction's length does not match `tape.num_inputs()`. -pub fn laplacian_hutchpp( - tape: &BytecodeTape, - x: &[F], - sketch_directions: &[&[F]], - stochastic_directions: &[&[F]], -) -> EstimatorResult { - assert!( - !sketch_directions.is_empty(), - "sketch_directions must not be empty" - ); - assert!( - !stochastic_directions.is_empty(), - "stochastic_directions must not be empty" - ); - - let n = tape.num_inputs(); - let two = F::from(2.0).unwrap(); - let eps = F::from(1e-12).unwrap(); - - // ── Step 1: Sketch — k HVPs to get columns of H·S ── - let mut dual_vals_buf = Vec::new(); - let mut adjoint_buf = Vec::new(); - let mut hs_columns: Vec> = Vec::with_capacity(sketch_directions.len()); - - for s in sketch_directions { - assert_eq!( - s.len(), - n, - "sketch direction length must match tape.num_inputs()" - ); - let (_grad, hvp) = tape.hvp_with_buf(x, s, &mut dual_vals_buf, &mut adjoint_buf); - hs_columns.push(hvp); - } - - // ── Step 2: QR via Modified Gram-Schmidt ── - let rank = modified_gram_schmidt(&mut hs_columns, eps); - let q = &hs_columns; // q[0..rank] are orthonormal basis vectors - - // ── Step 3: Exact subspace trace ── - let mut taylor_buf = Vec::new(); - let mut value = F::zero(); - let mut exact_trace = F::zero(); - - for qi in q.iter().take(rank) { - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, qi, &mut taylor_buf); - value = c0; - exact_trace = exact_trace + two * c2; // q_i^T H q_i - } - - // ── Step 4: Residual Hutchinson ── - // For each stochastic direction g, project out Q: g' = g - Q(Q^T g) - let mut acc = WelfordAccumulator::new(); - let mut projected = vec![F::zero(); n]; - - for g in stochastic_directions.iter() { - assert_eq!( - g.len(), - n, - "stochastic direction length must match tape.num_inputs()" - ); - - // g' = g - Q(Q^T g) - projected.copy_from_slice(g); - for qi in q.iter().take(rank) { - let dot: F = qi - .iter() - .zip(g.iter()) - .fold(F::zero(), |acc, (&a, &b)| acc + a * b); - for (p, &qv) in projected.iter_mut().zip(qi.iter()) { - *p = *p - dot * qv; - } - } - - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, &projected, &mut taylor_buf); - value = c0; - acc.update(two * c2); - } - - let (residual_mean, sample_variance, standard_error) = acc.finalize(); - - EstimatorResult { - value, - estimate: exact_trace + residual_mean, - sample_variance, - standard_error, - num_samples: stochastic_directions.len(), - } -} - -// ══════════════════════════════════════════════ -// Divergence estimator -// ══════════════════════════════════════════════ - -/// Estimate the divergence (trace of Jacobian) of a vector field f: R^n → R^n. -/// -/// Uses Hutchinson's trace estimator with first-order forward-mode AD (`Dual`). -/// The tape must be recorded via [`record_multi`](crate::record_multi) with -/// `num_inputs == num_outputs`. -/// -/// For each random direction v with `E[vv^T] = I`: -/// 1. Seed `Dual` inputs: `Dual(x_i, v_i)` -/// 2. Forward pass: `tape.forward_tangent(&dual_inputs, &mut buf)` -/// 3. Sample = Σ_i v_i * buf\[out_indices\[i\]\].eps = v^T J v -/// -/// The mean of these samples converges to tr(J) = div(f). -/// -/// # Panics -/// -/// Panics if `directions` is empty, if `tape.num_outputs() != tape.num_inputs()`, -/// or if any direction's length does not match `tape.num_inputs()`. -pub fn divergence( - tape: &BytecodeTape, - x: &[F], - directions: &[&[F]], -) -> DivergenceResult { - assert!(!directions.is_empty(), "directions must not be empty"); - let n = tape.num_inputs(); - let m = tape.num_outputs(); - assert_eq!( - m, n, - "divergence requires num_outputs ({}) == num_inputs ({})", - m, n - ); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let out_indices = tape.all_output_indices(); - let mut buf: Vec> = Vec::new(); - let mut values = vec![F::zero(); m]; - let mut acc = WelfordAccumulator::new(); - - for (k, v) in directions.iter().enumerate() { - assert_eq!(v.len(), n, "direction length must match tape.num_inputs()"); - - // Build Dual inputs - let dual_inputs: Vec> = x - .iter() - .zip(v.iter()) - .map(|(&xi, &vi)| Dual::new(xi, vi)) - .collect(); - - tape.forward_tangent(&dual_inputs, &mut buf); - - // Extract function values (from first direction only is fine — all give same primal) - if k == 0 { - for (i, &oi) in out_indices.iter().enumerate() { - values[i] = buf[oi as usize].re; - } - } - - // sample = v^T J v = Σ_i v_i * (J v)_i - let sample: F = v - .iter() - .zip(out_indices.iter()) - .fold(F::zero(), |acc, (&vi, &oi)| acc + vi * buf[oi as usize].eps); - - acc.update(sample); - } - - let (estimate, sample_variance, standard_error) = acc.finalize(); - - DivergenceResult { - values, - estimate, - sample_variance, - standard_error, - num_samples: directions.len(), - } -} - -// ══════════════════════════════════════════════ -// Parabolic PDE σ-transform -// ══════════════════════════════════════════════ - -/// Estimate the diffusion term `½ tr(σσ^T · Hess u)` for parabolic PDEs. -/// -/// Uses the σ-transform: `½ tr(σσ^T H) = ½ Σ_i (σ·e_i)^T H (σ·e_i)` -/// where `σ·e_i` are the columns of σ (paper Eq 19, Appendix E). -/// -/// Each column pushforward gives `(σ·e_i)^T H (σ·e_i)` via the second-order -/// Taylor coefficient: `2 * c2`. The diffusion term is `½ * Σ 2*c2 = Σ c2`. -/// -/// # Arguments -/// -/// - `sigma_columns`: the columns of σ as slices, each of length `tape.num_inputs()`. -/// -/// Returns `(value, diffusion_term)`. -/// -/// # Panics -/// -/// Panics if `sigma_columns` is empty or any column's length does not match -/// `tape.num_inputs()`. -pub fn parabolic_diffusion( - tape: &BytecodeTape, - x: &[F], - sigma_columns: &[&[F]], -) -> (F, F) { - assert!(!sigma_columns.is_empty(), "sigma_columns must not be empty"); - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let half = F::from(0.5).unwrap(); - let two = F::from(2.0).unwrap(); - let mut buf = Vec::new(); - let mut value = F::zero(); - let mut sum = F::zero(); - - for col in sigma_columns { - assert_eq!( - col.len(), - n, - "sigma column length must match tape.num_inputs()" - ); - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, col, &mut buf); - value = c0; - sum = sum + two * c2; // (σ·e_i)^T H (σ·e_i) - } - - (value, half * sum) -} - -/// Stochastic version of [`parabolic_diffusion`]: subsample column indices. -/// -/// Estimate = `(d / |J|) · ½ · Σ_{i∈J} (σ·e_i)^T H (σ·e_i)`. -/// -/// # Panics -/// -/// Panics if `sampled_indices` is empty or any index is out of bounds. -pub fn parabolic_diffusion_stochastic( - tape: &BytecodeTape, - x: &[F], - sigma_columns: &[&[F]], - sampled_indices: &[usize], -) -> EstimatorResult { - assert!( - !sampled_indices.is_empty(), - "sampled_indices must not be empty" - ); - let n = tape.num_inputs(); - let d = sigma_columns.len(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let two = F::from(2.0).unwrap(); - let half = F::from(0.5).unwrap(); - let df = F::from(d).unwrap(); - - let mut buf = Vec::new(); - let mut value = F::zero(); - let mut acc = WelfordAccumulator::new(); - - for &i in sampled_indices { - assert!(i < d, "sampled index {} out of bounds (d={})", i, d); - let col = sigma_columns[i]; - assert_eq!( - col.len(), - n, - "sigma column length must match tape.num_inputs()" - ); - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, col, &mut buf); - value = c0; - acc.update(two * c2); // (σ·e_i)^T H (σ·e_i) - } - - let (mean, sample_variance, standard_error) = acc.finalize(); - - // Unbiased estimator for ½ Σ_i (σ·e_i)^T H (σ·e_i): d * mean * ½ - EstimatorResult { - value, - estimate: mean * df * half, - sample_variance, - standard_error, - num_samples: sampled_indices.len(), - } -} - -// ══════════════════════════════════════════════ -// Dense STDE for positive-definite operators -// ══════════════════════════════════════════════ - -/// Dense STDE for a positive-definite 2nd-order operator. -/// -/// Given a Cholesky factor `L` (lower-triangular) such that `C = L L^T`, -/// and standard Gaussian vectors `z_s ~ N(0, I)`, this computes -/// `v_s = L · z_s` internally and estimates -/// `tr(C · H_u(x)) = Σ_{ij} C_{ij} ∂²u/∂x_i∂x_j`. -/// -/// The key insight: if `E[v v^T] = C`, then `E[v^T H v] = tr(C · H)`. -/// With `v = L · z` where `z ~ N(0, I)`, we get `E[vv^T] = L L^T = C`. -/// -/// The caller provides `z_vectors` (standard Gaussian samples) and -/// `cholesky_rows` (the rows of L — only lower-triangular entries matter). -/// -/// **Indefinite C deferred**: For indefinite operators, manually split into -/// `C⁺ - C⁻`, compute Cholesky factors for each, and call twice. -/// -/// **No `rand` dependency**: callers provide z_vectors. -/// -/// # Panics -/// -/// Panics if `z_vectors` is empty, `cholesky_rows.len()` does not match -/// `tape.num_inputs()`, or any vector has the wrong length. -pub fn dense_stde_2nd( - tape: &BytecodeTape, - x: &[F], - cholesky_rows: &[&[F]], - z_vectors: &[&[F]], -) -> EstimatorResult { - assert!(!z_vectors.is_empty(), "z_vectors must not be empty"); - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - assert_eq!( - cholesky_rows.len(), - n, - "cholesky_rows.len() must match tape.num_inputs()" - ); - - let two = F::from(2.0).unwrap(); - let mut buf = Vec::new(); - let mut v = vec![F::zero(); n]; - let mut value = F::zero(); - let mut acc = WelfordAccumulator::new(); - - for z in z_vectors.iter() { - assert_eq!(z.len(), n, "z_vector length must match tape.num_inputs()"); - - // Compute v = L · z (lower-triangular mat-vec) - for i in 0..n { - let row = cholesky_rows[i]; - let mut sum = F::zero(); - // Only use lower-triangular entries: j <= i - for j in 0..=i { - sum = sum + row[j] * z[j]; - } - v[i] = sum; - } - - let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, &v, &mut buf); - value = c0; - // 2 * c2 = v^T H v, and E[v^T H v] = tr(C · H) when E[vv^T] = C - acc.update(two * c2); - } - - let (estimate, sample_variance, standard_error) = acc.finalize(); - - EstimatorResult { - value, - estimate, - sample_variance, - standard_error, - num_samples: z_vectors.len(), - } -} - -// ══════════════════════════════════════════════ -// Sparse STDE (requires stde + diffop) -// ══════════════════════════════════════════════ - -#[cfg(feature = "diffop")] -use crate::diffop::SparseSamplingDistribution; - -/// Sparse STDE: estimate `Lu(x)` by sampling sparse k-jets. -/// -/// Each sample index corresponds to an entry in `dist`. For each sample, -/// one forward pushforward of a sparse k-jet is performed. -/// -/// The per-sample estimator is: `sign(C_α) · Z · prefactor · coeffs[k]` -/// where `Z = dist.normalization()` (paper Eq 13). -/// -/// Paper reference: Eq 13, Section 4.3. -/// -/// # Panics -/// -/// Panics if `sampled_indices` is empty or any index is out of bounds. -#[cfg(feature = "diffop")] -pub fn stde_sparse( - tape: &BytecodeTape, - x: &[F], - dist: &SparseSamplingDistribution, - sampled_indices: &[usize], -) -> EstimatorResult { - assert!( - !sampled_indices.is_empty(), - "sampled_indices must not be empty" - ); - let n = tape.num_inputs(); - assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); - - let order = dist.jet_order() + 1; - let z = dist.normalization(); - let _guard = TaylorDynGuard::::new(order); - - let mut value = F::zero(); - let mut acc = WelfordAccumulator::new(); - let mut buf: Vec> = Vec::new(); - - for &idx in sampled_indices { - let entry = dist.entry(idx); - - // Build TaylorDyn inputs: coeffs[0] = x[i] for all, then set - // coeffs[slot] = 1/slot! for active variables per entry.input_coeffs - let inputs: Vec> = (0..n) - .map(|i| { - let mut coeffs = vec![F::zero(); order]; - coeffs[0] = x[i]; - for &(var, slot, inv_fact) in entry.input_coeffs() { - if var == i && slot < order { - coeffs[slot] = inv_fact; - } - } - TaylorDyn::from_coeffs(&coeffs) - }) - .collect(); - - tape.forward_tangent(&inputs, &mut buf); - - let out_coeffs = buf[tape.output_index()].coeffs(); - value = out_coeffs[0]; - - // Extract: raw = coeffs[output_coeff_index] * extraction_prefactor - let raw = out_coeffs[entry.output_coeff_index()] * entry.extraction_prefactor(); - // Sample = sign * Z * raw - let sample = entry.sign() * z * raw; - acc.update(sample); - } - - let (estimate, sample_variance, standard_error) = acc.finalize(); - - EstimatorResult { - value, - estimate, - sample_variance, - standard_error, - num_samples: sampled_indices.len(), - } -} diff --git a/src/stde/estimator.rs b/src/stde/estimator.rs new file mode 100644 index 0000000..b62ae5a --- /dev/null +++ b/src/stde/estimator.rs @@ -0,0 +1,45 @@ +use crate::Float; + +/// Trait for stochastic estimators that combine Taylor jet coefficients into +/// a per-direction sample. +/// +/// Given the Taylor coefficients `(c0, c1, c2)` from propagating a random +/// direction through a tape, an `Estimator` produces a scalar sample whose +/// expectation (over random directions with `E[vv^T] = I`) equals the +/// desired quantity. +pub trait Estimator { + /// Compute one sample from the Taylor jet coefficients. + /// + /// - `c0` = f(x) + /// - `c1` = ∇f(x)·v (directional first derivative) + /// - `c2` = v^T H v / 2 (half directional second derivative) + fn sample(&self, c0: F, c1: F, c2: F) -> F; +} + +/// Hutchinson trace estimator: estimates tr(H) = Laplacian. +/// +/// Each sample is `2 * c2 = v^T H v`. Since `E[v^T H v] = tr(H)` when +/// `E[vv^T] = I`, the mean of these samples converges to the Laplacian. +pub struct Laplacian; + +impl Estimator for Laplacian { + #[inline] + fn sample(&self, _c0: F, _c1: F, c2: F) -> F { + F::from(2.0).unwrap() * c2 + } +} + +/// Estimates `||∇f||²` (squared gradient norm). +/// +/// Each sample is `c1² = (∇f·v)²`. Since `E[(∇f·v)²] = ∇f^T E[vv^T] ∇f = ||∇f||²` +/// when `E[vv^T] = I`, the mean converges to the squared gradient norm. +/// +/// Useful for score matching loss functions where `||∇ log p||²` appears. +pub struct GradientSquaredNorm; + +impl Estimator for GradientSquaredNorm { + #[inline] + fn sample(&self, _c0: F, c1: F, _c2: F) -> F { + c1 * c1 + } +} diff --git a/src/stde/higher_order.rs b/src/stde/higher_order.rs new file mode 100644 index 0000000..3d7e96b --- /dev/null +++ b/src/stde/higher_order.rs @@ -0,0 +1,303 @@ +use super::types::{EstimatorResult, WelfordAccumulator}; +use crate::bytecode_tape::BytecodeTape; +use crate::taylor::Taylor; +use crate::taylor_dyn::{TaylorArenaLocal, TaylorDyn, TaylorDynGuard}; +use crate::Float; + +/// Exact k-th order diagonal: `[∂^k u/∂x_j^k for j in 0..n]`. +/// +/// Pushes n basis vectors through order-(k+1) `TaylorDyn` jets. For each +/// coordinate j, the input jet has `coeffs_j = [x_j, 1, 0, ..., 0]` and all +/// other inputs `coeffs_i = [x_i, 0, ..., 0]`. The output coefficient at +/// index k stores `∂^k u/∂x_j^k / k!`, so the derivative is `k! * coeffs[k]`. +/// +/// Requires `F: Float + TaylorArenaLocal` because the jet order is +/// runtime-determined (unlike [`hessian_diagonal`](super::hessian_diagonal) which uses const-generic +/// `Taylor`). +/// +/// # Panics +/// +/// Panics if `k < 2`, `k > 20` (factorial overflow guard for f64), or if +/// `x.len()` does not match `tape.num_inputs()`. +pub fn diagonal_kth_order( + tape: &BytecodeTape, + x: &[F], + k: usize, +) -> (F, Vec) { + let mut buf = Vec::new(); + diagonal_kth_order_with_buf(tape, x, k, &mut buf) +} + +/// Like [`diagonal_kth_order`] but reuses a caller-provided buffer. +pub fn diagonal_kth_order_with_buf( + tape: &BytecodeTape, + x: &[F], + k: usize, + buf: &mut Vec>, +) -> (F, Vec) { + assert!(k >= 2, "k must be >= 2 (use gradient for k=1)"); + assert!(k <= 20, "k must be <= 20 (k! overflows f64 for k > 20)"); + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let order = k + 1; // number of Taylor coefficients + let _guard = TaylorDynGuard::::new(order); + + let mut k_factorial = F::one(); + for i in 2..=k { + k_factorial = k_factorial * F::from(i).unwrap(); + } + + let mut diag = Vec::with_capacity(n); + let mut value = F::zero(); + + for j in 0..n { + // Build TaylorDyn inputs: coeffs_j = [x_j, 1, 0, ..., 0], others = [x_i, 0, ..., 0] + let inputs: Vec> = (0..n) + .map(|i| { + let mut coeffs = vec![F::zero(); order]; + coeffs[0] = x[i]; + if i == j { + coeffs[1] = F::one(); + } + TaylorDyn::from_coeffs(&coeffs) + }) + .collect(); + + tape.forward_tangent(&inputs, buf); + + let out_coeffs = buf[tape.output_index()].coeffs(); + value = out_coeffs[0]; + diag.push(k_factorial * out_coeffs[k]); + } + + (value, diag) +} + +/// Exact k-th order diagonal using const-generic `Taylor`. +/// +/// `ORDER = k + 1` where `k = ORDER - 1` is the derivative order. Uses +/// stack-allocated Taylor coefficients — faster than [`diagonal_kth_order`] +/// when k is known at compile time. +/// +/// # Precision Note +/// +/// For f32, `k! > 2^23` when k ≥ 13 (ORDER ≥ 14), causing precision loss. +/// The existing [`diagonal_kth_order`] has a runtime guard `k ≤ 20` for f64; +/// the const-generic version has no such guard but users should be aware +/// that for f32, ORDER ≤ 14 is the practical limit. +/// +/// # Panics +/// +/// Compile-time error if `ORDER < 3` (i.e., k < 2). +/// Runtime panic if `x.len()` does not match `tape.num_inputs()`. +pub fn diagonal_kth_order_const( + tape: &BytecodeTape, + x: &[F], +) -> (F, Vec) { + let mut buf: Vec> = Vec::new(); + diagonal_kth_order_const_with_buf(tape, x, &mut buf) +} + +/// Like [`diagonal_kth_order_const`] but reuses a caller-provided buffer. +pub fn diagonal_kth_order_const_with_buf( + tape: &BytecodeTape, + x: &[F], + buf: &mut Vec>, +) -> (F, Vec) { + const { assert!(ORDER >= 3, "ORDER must be >= 3 (k=ORDER-1 >= 2)") } + + let k = ORDER - 1; + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let mut k_factorial = F::one(); + for i in 2..=k { + k_factorial = k_factorial * F::from(i).unwrap(); + } + + let mut diag = Vec::with_capacity(n); + let mut value = F::zero(); + + for j in 0..n { + let inputs: Vec> = (0..n) + .map(|i| { + let mut coeffs = [F::zero(); ORDER]; + coeffs[0] = x[i]; + if i == j { + coeffs[1] = F::one(); + } + Taylor::new(coeffs) + }) + .collect(); + + tape.forward_tangent(&inputs, buf); + + let out = buf[tape.output_index()]; + value = out.coeffs[0]; + diag.push(k_factorial * out.coeffs[k]); + } + + (value, diag) +} + +/// Stochastic k-th order diagonal estimate: `Σ_j ∂^k u/∂x_j^k`. +/// +/// Evaluates only the coordinate indices in `sampled_indices`, then scales +/// by `n / |J|` to produce an unbiased estimate of the full sum. +/// +/// # Panics +/// +/// Panics if `sampled_indices` is empty, `k < 2`, `k > 20`, or if +/// `x.len()` does not match `tape.num_inputs()`. +pub fn diagonal_kth_order_stochastic( + tape: &BytecodeTape, + x: &[F], + k: usize, + sampled_indices: &[usize], +) -> EstimatorResult { + assert!( + !sampled_indices.is_empty(), + "sampled_indices must not be empty" + ); + assert!(k >= 2, "k must be >= 2 (use gradient for k=1)"); + assert!(k <= 20, "k must be <= 20 (k! overflows f64 for k > 20)"); + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let order = k + 1; + let _guard = TaylorDynGuard::::new(order); + + let mut k_factorial = F::one(); + for i in 2..=k { + k_factorial = k_factorial * F::from(i).unwrap(); + } + + let nf = F::from(n).unwrap(); + + let mut value = F::zero(); + let mut acc = WelfordAccumulator::new(); + let mut buf: Vec> = Vec::new(); + + for &j in sampled_indices { + assert!(j < n, "sampled index {} out of bounds (n={})", j, n); + + let inputs: Vec> = (0..n) + .map(|i| { + let mut coeffs = vec![F::zero(); order]; + coeffs[0] = x[i]; + if i == j { + coeffs[1] = F::one(); + } + TaylorDyn::from_coeffs(&coeffs) + }) + .collect(); + + tape.forward_tangent(&inputs, &mut buf); + + let out_coeffs = buf[tape.output_index()].coeffs(); + value = out_coeffs[0]; + // Per-sample: k! * coeffs[k] (the diagonal entry for coordinate j) + acc.update(k_factorial * out_coeffs[k]); + } + + let (mean, sample_variance, standard_error) = acc.finalize(); + + // Unbiased estimator for Σ_j d_j: n * mean(sampled d_j's) + EstimatorResult { + value, + estimate: mean * nf, + sample_variance, + standard_error, + num_samples: sampled_indices.len(), + } +} + +/// Propagate direction `v` through tape using `TaylorDyn` with the given order. +/// +/// Creates a `TaylorDynGuard` internally, builds `TaylorDyn` inputs from +/// `(x, v)` with coefficients `[x_i, v_i, 0, ..., 0]`, runs `forward_tangent`, +/// and returns the full coefficient vector of the output. +/// +/// # Panics +/// +/// Panics if `x.len()` or `v.len()` does not match `tape.num_inputs()`, +/// or if `order < 2`. +pub fn taylor_jet_dyn( + tape: &BytecodeTape, + x: &[F], + v: &[F], + order: usize, +) -> Vec { + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + assert_eq!(v.len(), n, "v.len() must match tape.num_inputs()"); + assert!(order >= 2, "order must be >= 2"); + + let _guard = TaylorDynGuard::::new(order); + + let inputs: Vec> = x + .iter() + .zip(v.iter()) + .map(|(&xi, &vi)| { + let mut coeffs = vec![F::zero(); order]; + coeffs[0] = xi; + coeffs[1] = vi; + TaylorDyn::from_coeffs(&coeffs) + }) + .collect(); + + let mut buf = Vec::new(); + tape.forward_tangent(&inputs, &mut buf); + + buf[tape.output_index()].coeffs() +} + +/// Estimate the Laplacian via `TaylorDyn` (runtime-determined order). +/// +/// Uses order 3 (coefficients c0, c1, c2) internally. Manages its own +/// arena guard. +/// +/// Returns `(value, laplacian_estimate)`. +/// +/// # Panics +/// +/// Panics if `directions` is empty or any direction's length does not match +/// `tape.num_inputs()`. +pub fn laplacian_dyn( + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], +) -> (F, F) { + assert!(!directions.is_empty(), "directions must not be empty"); + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let _guard = TaylorDynGuard::::new(3); + + let two = F::from(2.0).unwrap(); + let s = F::from(directions.len()).unwrap(); + let mut sum = F::zero(); + let mut value = F::zero(); + let mut buf: Vec> = Vec::new(); + + for v in directions { + assert_eq!(v.len(), n, "direction length must match tape.num_inputs()"); + + let inputs: Vec> = x + .iter() + .zip(v.iter()) + .map(|(&xi, &vi)| TaylorDyn::from_coeffs(&[xi, vi, F::zero()])) + .collect(); + + tape.forward_tangent(&inputs, &mut buf); + + let out = buf[tape.output_index()]; + let coeffs = out.coeffs(); + value = coeffs[0]; + let c2 = coeffs[2]; + sum = sum + two * c2; + } + + (value, sum / s) +} diff --git a/src/stde/jet.rs b/src/stde/jet.rs new file mode 100644 index 0000000..8998b6e --- /dev/null +++ b/src/stde/jet.rs @@ -0,0 +1,76 @@ +use crate::bytecode_tape::BytecodeTape; +use crate::taylor::Taylor; +use crate::Float; + +/// Propagate direction `v` through tape using second-order Taylor mode. +/// +/// Constructs `Taylor` inputs where `input[i] = [x[i], v[i], 0]`, +/// runs `forward_tangent`, and extracts the output coefficients. +/// +/// Returns `(f(x), nabla_f . v, v^T H v / 2)`. +/// +/// # Panics +/// +/// Panics if `x.len()` or `v.len()` does not match `tape.num_inputs()`. +pub fn taylor_jet_2nd(tape: &BytecodeTape, x: &[F], v: &[F]) -> (F, F, F) { + let mut buf = Vec::new(); + taylor_jet_2nd_with_buf(tape, x, v, &mut buf) +} + +/// Like [`taylor_jet_2nd`] but reuses a caller-provided buffer to avoid +/// reallocation across multiple calls. +/// +/// # Panics +/// +/// Panics if `x.len()` or `v.len()` does not match `tape.num_inputs()`. +pub fn taylor_jet_2nd_with_buf( + tape: &BytecodeTape, + x: &[F], + v: &[F], + buf: &mut Vec>, +) -> (F, F, F) { + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + assert_eq!(v.len(), n, "v.len() must match tape.num_inputs()"); + + let inputs: Vec> = x + .iter() + .zip(v.iter()) + .map(|(&xi, &vi)| Taylor::new([xi, vi, F::zero()])) + .collect(); + + tape.forward_tangent(&inputs, buf); + + let out = buf[tape.output_index()]; + (out.coeffs[0], out.coeffs[1], out.coeffs[2]) +} + +/// Evaluate multiple directions through the tape. +/// +/// Returns `(value, first_order, second_order)` where: +/// - `value` = f(x) +/// - `first_order[s]` = nabla_f . v_s (directional first derivative) +/// - `second_order[s]` = v_s^T H v_s / 2 (half directional second derivative) +/// +/// # Panics +/// +/// Panics if any direction's length does not match `tape.num_inputs()`. +pub fn directional_derivatives( + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], +) -> (F, Vec, Vec) { + let mut buf = Vec::new(); + let mut first_order = Vec::with_capacity(directions.len()); + let mut second_order = Vec::with_capacity(directions.len()); + let mut value = F::zero(); + + for v in directions { + let (c0, c1, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); + value = c0; + first_order.push(c1); + second_order.push(c2); + } + + (value, first_order, second_order) +} diff --git a/src/stde/laplacian.rs b/src/stde/laplacian.rs new file mode 100644 index 0000000..a50aabe --- /dev/null +++ b/src/stde/laplacian.rs @@ -0,0 +1,342 @@ +use super::estimator::Laplacian; +use super::jet::{directional_derivatives, taylor_jet_2nd_with_buf}; +use super::pipeline::estimate; +use super::types::{EstimatorResult, WelfordAccumulator}; +use crate::bytecode_tape::BytecodeTape; +use crate::taylor::Taylor; +use crate::Float; + +/// Estimate the Laplacian (trace of Hessian) via Hutchinson's trace estimator. +/// +/// Directions must satisfy E[vv^T] = I (e.g. Rademacher vectors with entries +/// +/-1, or standard Gaussian vectors). The estimator is: +/// +/// Laplacian ~ (1/S) * sum_s 2*c2_s +/// +/// where c2_s is the second Taylor coefficient for direction s. +/// +/// Returns `(value, laplacian_estimate)`. +/// +/// Note: coordinate basis vectors do **not** satisfy E[vv^T] = I and will +/// give tr(H)/n instead of tr(H). Use [`hessian_diagonal`] and sum for exact +/// computation via coordinate directions. +/// +/// # Panics +/// +/// Panics if `directions` is empty or any direction's length does not match +/// `tape.num_inputs()`. +pub fn laplacian(tape: &BytecodeTape, x: &[F], directions: &[&[F]]) -> (F, F) { + assert!(!directions.is_empty(), "directions must not be empty"); + + let (value, _, second_order) = directional_derivatives(tape, x, directions); + + let two = F::from(2.0).unwrap(); + let s = F::from(directions.len()).unwrap(); + let sum: F = second_order + .iter() + .fold(F::zero(), |acc, &c2| acc + two * c2); + let laplacian = sum / s; + + (value, laplacian) +} + +/// Estimate the Laplacian with sample statistics via Hutchinson's trace estimator. +/// +/// Same estimator as [`laplacian`], but additionally computes sample variance +/// and standard error using Welford's online algorithm (numerically stable, +/// single pass). +/// +/// Each direction produces a sample `2 * c2_s`. The returned statistics +/// describe the distribution of these samples. +/// +/// # Panics +/// +/// Panics if `directions` is empty or any direction's length does not match +/// `tape.num_inputs()`. +pub fn laplacian_with_stats( + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], +) -> EstimatorResult { + estimate(&Laplacian, tape, x, directions) +} + +/// Estimate the Laplacian with a diagonal control variate. +/// +/// Uses the exact Hessian diagonal (from [`hessian_diagonal`]) as a control +/// variate to reduce estimator variance. Each raw Hutchinson sample +/// `raw_s = v^T H v = 2 * c2_s` is adjusted: +/// +/// ```text +/// adjusted_s = raw_s - sum_j(D_jj * v_j^2) + tr(D) +/// ``` +/// +/// where D is the diagonal of H. The adjustment subtracts the noisy diagonal +/// contribution and adds back its exact expectation `tr(D)`. +/// +/// **Effect by distribution**: +/// - **Gaussian**: reduces variance from `2||H||_F^2` to `2 sum_{i≠j} H_ij^2` +/// (matching Rademacher performance). +/// - **Rademacher**: no effect, since `v_j^2 = 1` always, so the adjustment +/// is `sum_j D_jj * 1 - tr(D) = 0`. +/// +/// Returns an [`EstimatorResult`] with statistics computed over the adjusted +/// samples. +/// +/// # Panics +/// +/// Panics if `directions` is empty, if any direction's length does not match +/// `tape.num_inputs()`, or if `control_diagonal.len() != tape.num_inputs()`. +pub fn laplacian_with_control( + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], + control_diagonal: &[F], +) -> EstimatorResult { + assert!(!directions.is_empty(), "directions must not be empty"); + let n = tape.num_inputs(); + assert_eq!( + control_diagonal.len(), + n, + "control_diagonal.len() must match tape.num_inputs()" + ); + + let two = F::from(2.0).unwrap(); + let trace_control: F = control_diagonal + .iter() + .copied() + .fold(F::zero(), |a, b| a + b); + + let mut buf = Vec::new(); + let mut value = F::zero(); + let mut acc = WelfordAccumulator::new(); + + for v in directions.iter() { + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); + value = c0; + + let raw = two * c2; + + // Control variate: subtract v^T D v, add back E[v^T D v] = tr(D) + let cv: F = control_diagonal + .iter() + .zip(v.iter()) + .fold(F::zero(), |acc, (&d, &vi)| acc + d * vi * vi); + acc.update(raw - cv + trace_control); + } + + let (estimate, sample_variance, standard_error) = acc.finalize(); + + EstimatorResult { + value, + estimate, + sample_variance, + standard_error, + num_samples: directions.len(), + } +} + +/// Exact Hessian diagonal via n coordinate-direction evaluations. +/// +/// For each coordinate j, pushes basis vector e_j through the tape and +/// reads `2 * c2`, which equals `d^2 f / dx_j^2`. +/// +/// Returns `(value, diag)` where `diag[j] = d^2 f / dx_j^2`. +pub fn hessian_diagonal(tape: &BytecodeTape, x: &[F]) -> (F, Vec) { + let mut buf = Vec::new(); + hessian_diagonal_with_buf(tape, x, &mut buf) +} + +/// Like [`hessian_diagonal`] but reuses a caller-provided buffer. +pub fn hessian_diagonal_with_buf( + tape: &BytecodeTape, + x: &[F], + buf: &mut Vec>, +) -> (F, Vec) { + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let two = F::from(2.0).unwrap(); + let mut diag = Vec::with_capacity(n); + let mut value = F::zero(); + + // Build basis vector once, mutate the hot coordinate + let mut e = vec![F::zero(); n]; + for j in 0..n { + e[j] = F::one(); + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, &e, buf); + value = c0; + diag.push(two * c2); + e[j] = F::zero(); + } + + (value, diag) +} + +/// Modified Gram-Schmidt orthonormalisation, in-place. +/// +/// Orthonormalises the columns of the matrix represented as `columns: &mut Vec>`. +/// Drops near-zero columns (norm < epsilon). Returns the rank (number of retained columns). +fn modified_gram_schmidt(columns: &mut Vec>, epsilon: F) -> usize { + let mut rank = 0; + let mut i = 0; + while i < columns.len() { + // Orthogonalise against all previously accepted columns + for j in 0..rank { + // Split to satisfy the borrow checker: j < rank <= i + let (left, right) = columns.split_at_mut(i); + let qj = &left[j]; + let ci = &mut right[0]; + let dot: F = qj + .iter() + .zip(ci.iter()) + .fold(F::zero(), |acc, (&a, &b)| acc + a * b); + for (c, &q) in ci.iter_mut().zip(qj.iter()) { + *c = *c - dot * q; + } + } + + // Compute norm + let norm_sq: F = columns[i].iter().fold(F::zero(), |acc, &v| acc + v * v); + let norm = norm_sq.sqrt(); + + if norm < epsilon { + // Drop this column (near-zero after projection) + columns.swap_remove(i); + // Don't increment i — swapped element needs processing + } else { + // Normalise + let inv_norm = F::one() / norm; + for v in columns[i].iter_mut() { + *v = *v * inv_norm; + } + // Move to rank position + if i != rank { + columns.swap(i, rank); + } + rank += 1; + i += 1; + } + } + columns.truncate(rank); + rank +} + +/// Hutch++ trace estimator (Meyer et al. 2021) for the Laplacian. +/// +/// Achieves O(1/S²) convergence for matrices with decaying eigenvalues by +/// splitting the work into: +/// +/// 1. **Sketch phase**: k HVPs (via `tape.hvp_with_buf`) produce columns of H·S, +/// which are orthonormalised via Modified Gram-Schmidt to give a basis Q. +/// 2. **Exact subspace trace**: For each q_i in Q, `taylor_jet_2nd` gives +/// q_i^T H q_i. Sum = tr(Q^T H Q) — this part has zero variance. +/// 3. **Residual Hutchinson**: For each stochastic direction g_s, project out Q +/// (g' = g - Q(Q^T g)) and estimate the residual trace via `taylor_jet_2nd`. +/// 4. **Total** = exact_trace + residual_mean. +/// +/// The variance and standard error in the result refer to the residual only. +/// +/// # Arguments +/// +/// - `sketch_directions`: k directions for the sketch phase. Typically Rademacher +/// or Gaussian. More directions capture more of the spectrum exactly. +/// - `stochastic_directions`: S directions for residual estimation. Rademacher recommended. +/// +/// # Cost +/// +/// k HVPs (≈2k forward passes) + k Taylor jets (exact subspace) + S Taylor jets +/// (residual) + O(k²·n) for Gram-Schmidt. +/// +/// # Panics +/// +/// Panics if `sketch_directions` or `stochastic_directions` is empty, or if any +/// direction's length does not match `tape.num_inputs()`. +pub fn laplacian_hutchpp( + tape: &BytecodeTape, + x: &[F], + sketch_directions: &[&[F]], + stochastic_directions: &[&[F]], +) -> EstimatorResult { + assert!( + !sketch_directions.is_empty(), + "sketch_directions must not be empty" + ); + assert!( + !stochastic_directions.is_empty(), + "stochastic_directions must not be empty" + ); + + let n = tape.num_inputs(); + let two = F::from(2.0).unwrap(); + let eps = F::from(1e-12).unwrap(); + + // ── Step 1: Sketch — k HVPs to get columns of H·S ── + let mut dual_vals_buf = Vec::new(); + let mut adjoint_buf = Vec::new(); + let mut hs_columns: Vec> = Vec::with_capacity(sketch_directions.len()); + + for s in sketch_directions { + assert_eq!( + s.len(), + n, + "sketch direction length must match tape.num_inputs()" + ); + let (_grad, hvp) = tape.hvp_with_buf(x, s, &mut dual_vals_buf, &mut adjoint_buf); + hs_columns.push(hvp); + } + + // ── Step 2: QR via Modified Gram-Schmidt ── + let rank = modified_gram_schmidt(&mut hs_columns, eps); + let q = &hs_columns; // q[0..rank] are orthonormal basis vectors + + // ── Step 3: Exact subspace trace ── + let mut taylor_buf = Vec::new(); + let mut value = F::zero(); + let mut exact_trace = F::zero(); + + for qi in q.iter().take(rank) { + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, qi, &mut taylor_buf); + value = c0; + exact_trace = exact_trace + two * c2; // q_i^T H q_i + } + + // ── Step 4: Residual Hutchinson ── + // For each stochastic direction g, project out Q: g' = g - Q(Q^T g) + let mut acc = WelfordAccumulator::new(); + let mut projected = vec![F::zero(); n]; + + for g in stochastic_directions.iter() { + assert_eq!( + g.len(), + n, + "stochastic direction length must match tape.num_inputs()" + ); + + // g' = g - Q(Q^T g) + projected.copy_from_slice(g); + for qi in q.iter().take(rank) { + let dot: F = qi + .iter() + .zip(g.iter()) + .fold(F::zero(), |acc, (&a, &b)| acc + a * b); + for (p, &qv) in projected.iter_mut().zip(qi.iter()) { + *p = *p - dot * qv; + } + } + + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, &projected, &mut taylor_buf); + value = c0; + acc.update(two * c2); + } + + let (residual_mean, sample_variance, standard_error) = acc.finalize(); + + EstimatorResult { + value, + estimate: exact_trace + residual_mean, + sample_variance, + standard_error, + num_samples: stochastic_directions.len(), + } +} diff --git a/src/stde/mod.rs b/src/stde/mod.rs new file mode 100644 index 0000000..5147594 --- /dev/null +++ b/src/stde/mod.rs @@ -0,0 +1,117 @@ +//! Stochastic Taylor Derivative Estimators (STDE). +//! +//! Estimate differential operators (Laplacian, Hessian diagonal, directional +//! derivatives) by pushing random direction vectors through Taylor-mode AD. +//! +//! # How it works +//! +//! For f: R^n -> R at point x, define g(t) = f(x + t*v) where v is a +//! direction vector. The Taylor coefficients of g at t=0 are: +//! +//! - c0 = f(x) +//! - c1 = nabla f(x) . v (directional first derivative) +//! - c2 = v^T H_f(x) v / 2 (half the directional second derivative) +//! +//! By choosing v appropriately (Rademacher, Gaussian, coordinate basis), +//! we can estimate operators like the Laplacian in O(S*K*L) time instead +//! of O(n^2*L) for the full Hessian. +//! +//! # Variance properties +//! +//! The Hutchinson estimator `(1/S) sum_s v_s^T H v_s` is unbiased when +//! E\[vv^T\] = I. Its variance depends on the distribution of v: +//! +//! - **Rademacher** (entries ±1): Var = `2 sum_{i≠j} H_ij^2`. The diagonal +//! contributes zero variance since `v_i^2 = 1` always. +//! - **Gaussian** (v ~ N(0,I)): Var = `2 ||H||_F^2`. Higher variance than +//! Rademacher because `v_i^2 ~ chi-squared(1)` introduces diagonal noise. +//! +//! The [`laplacian_with_control`] function reduces Gaussian variance to match +//! Rademacher by subtracting the exact diagonal contribution (a control +//! variate). This requires the diagonal from [`hessian_diagonal`] (n extra +//! evaluations). For Rademacher directions, the control variate has no effect +//! since the diagonal variance is already zero. +//! +//! **Antithetic sampling** (pairing +v with -v) does **not** reduce variance +//! for trace estimation. The Hutchinson sample `v^T H v` is quadratic (even) +//! in v, so `(-v)^T H (-v) = v^T H v` — antithetic pairs are identical. +//! +//! # Design +//! +//! - **No `rand` dependency**: all functions accept user-provided direction +//! vectors. The library stays pure; users bring their own RNG. +//! - **`Taylor`** for second-order operators: stack-allocated, Copy, +//! monomorphized. The order K=3 is statically known. +//! - **`TaylorDyn`** variants for runtime-determined order. +//! - **Panics on misuse**: dimension mismatches panic, following existing +//! API conventions (`record`, `grad`, `hvp`). +//! +//! # Const-Generic Higher-Order Diagonal +//! +//! [`diagonal_kth_order_const`] is a stack-allocated variant of +//! [`diagonal_kth_order`] for compile-time-known derivative order. It uses +//! `Taylor` directly (no `TaylorDyn` arena), which is faster when +//! the order is statically known. `ORDER = k + 1` where k is the derivative +//! order. For f32, practical limit is `ORDER ≤ 14` (k ≤ 13) since `k! > 2^23` +//! causes precision loss. +//! +//! # Higher-Order Estimation +//! +//! [`diagonal_kth_order`] generalises [`hessian_diagonal`] from k=2 to +//! arbitrary k, computing exact `[∂^k u/∂x_j^k]` for all coordinates via +//! `TaylorDyn` jets of order k+1. The stochastic variant +//! [`diagonal_kth_order_stochastic`] subsamples coordinates for an unbiased +//! estimate of `Σ_j ∂^k u/∂x_j^k`. +//! +//! # Dense STDE for Positive-Definite Operators +//! +//! [`dense_stde_2nd`] estimates `tr(C · H_u(x)) = Σ_{ij} C_{ij} ∂²u/∂x_i∂x_j` +//! for a positive-definite coefficient matrix C. The caller provides a Cholesky +//! factor L (such that `C = L L^T`) and standard Gaussian vectors z. Internally, +//! `v = L · z` is computed, then pushed through a second-order Taylor jet. When +//! `L = I`, this reduces to the Hutchinson Laplacian estimator. +//! +//! # Parabolic PDE σ-Transform +//! +//! [`parabolic_diffusion`] computes `½ tr(σσ^T · Hess u)` for parabolic PDEs +//! (Fokker-Planck, Black-Scholes, HJB) by pushing columns of σ through +//! second-order Taylor jets. This avoids forming the off-diagonal Hessian +//! entries that a naïve `tr(A·H)` computation would require. +//! +//! # Sparse STDE (requires `stde` + `diffop`) +//! +//! `stde_sparse` estimates arbitrary differential operators `Lu(x)` by +//! sampling sparse k-jets from a `SparseSamplingDistribution` built via +//! `DiffOp::sparse_distribution`. Each sample is a single forward +//! pushforward; the per-sample estimator is `sign(C_α) · Z · D^α u` where +//! Z = Σ|C_α| is the normalization constant. This implements the core +//! contribution of Shi et al. (NeurIPS 2024). + +mod estimator; +mod higher_order; +mod jet; +mod laplacian; +mod pde; +mod pipeline; +#[cfg(feature = "diffop")] +mod sparse; +mod types; + +pub use estimator::{Estimator, GradientSquaredNorm, Laplacian}; +pub use higher_order::{ + diagonal_kth_order, diagonal_kth_order_const, diagonal_kth_order_const_with_buf, + diagonal_kth_order_stochastic, diagonal_kth_order_with_buf, laplacian_dyn, taylor_jet_dyn, +}; +pub use jet::{directional_derivatives, taylor_jet_2nd, taylor_jet_2nd_with_buf}; +pub use laplacian::{ + hessian_diagonal, hessian_diagonal_with_buf, laplacian, laplacian_hutchpp, + laplacian_with_control, laplacian_with_stats, +}; +pub use pde::{ + dense_stde_2nd, divergence, parabolic_diffusion, parabolic_diffusion_stochastic, +}; +pub use pipeline::{estimate, estimate_weighted}; +pub use types::{DivergenceResult, EstimatorResult}; + +#[cfg(feature = "diffop")] +pub use sparse::stde_sparse; diff --git a/src/stde/pde.rs b/src/stde/pde.rs new file mode 100644 index 0000000..e9bfc0d --- /dev/null +++ b/src/stde/pde.rs @@ -0,0 +1,256 @@ +use super::jet::taylor_jet_2nd_with_buf; +use super::types::{DivergenceResult, EstimatorResult, WelfordAccumulator}; +use crate::bytecode_tape::BytecodeTape; +use crate::dual::Dual; +use crate::Float; + +/// Estimate the divergence (trace of Jacobian) of a vector field f: R^n → R^n. +/// +/// Uses Hutchinson's trace estimator with first-order forward-mode AD (`Dual`). +/// The tape must be recorded via [`record_multi`](crate::record_multi) with +/// `num_inputs == num_outputs`. +/// +/// For each random direction v with `E[vv^T] = I`: +/// 1. Seed `Dual` inputs: `Dual(x_i, v_i)` +/// 2. Forward pass: `tape.forward_tangent(&dual_inputs, &mut buf)` +/// 3. Sample = Σ_i v_i * buf\[out_indices\[i\]\].eps = v^T J v +/// +/// The mean of these samples converges to tr(J) = div(f). +/// +/// # Panics +/// +/// Panics if `directions` is empty, if `tape.num_outputs() != tape.num_inputs()`, +/// or if any direction's length does not match `tape.num_inputs()`. +pub fn divergence( + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], +) -> DivergenceResult { + assert!(!directions.is_empty(), "directions must not be empty"); + let n = tape.num_inputs(); + let m = tape.num_outputs(); + assert_eq!( + m, n, + "divergence requires num_outputs ({}) == num_inputs ({})", + m, n + ); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let out_indices = tape.all_output_indices(); + let mut buf: Vec> = Vec::new(); + let mut values = vec![F::zero(); m]; + let mut acc = WelfordAccumulator::new(); + + for (k, v) in directions.iter().enumerate() { + assert_eq!(v.len(), n, "direction length must match tape.num_inputs()"); + + // Build Dual inputs + let dual_inputs: Vec> = x + .iter() + .zip(v.iter()) + .map(|(&xi, &vi)| Dual::new(xi, vi)) + .collect(); + + tape.forward_tangent(&dual_inputs, &mut buf); + + // Extract function values (from first direction only is fine — all give same primal) + if k == 0 { + for (i, &oi) in out_indices.iter().enumerate() { + values[i] = buf[oi as usize].re; + } + } + + // sample = v^T J v = Σ_i v_i * (J v)_i + let sample: F = v + .iter() + .zip(out_indices.iter()) + .fold(F::zero(), |acc, (&vi, &oi)| acc + vi * buf[oi as usize].eps); + + acc.update(sample); + } + + let (estimate, sample_variance, standard_error) = acc.finalize(); + + DivergenceResult { + values, + estimate, + sample_variance, + standard_error, + num_samples: directions.len(), + } +} + +/// Estimate the diffusion term `½ tr(σσ^T · Hess u)` for parabolic PDEs. +/// +/// Uses the σ-transform: `½ tr(σσ^T H) = ½ Σ_i (σ·e_i)^T H (σ·e_i)` +/// where `σ·e_i` are the columns of σ (paper Eq 19, Appendix E). +/// +/// Each column pushforward gives `(σ·e_i)^T H (σ·e_i)` via the second-order +/// Taylor coefficient: `2 * c2`. The diffusion term is `½ * Σ 2*c2 = Σ c2`. +/// +/// # Arguments +/// +/// - `sigma_columns`: the columns of σ as slices, each of length `tape.num_inputs()`. +/// +/// Returns `(value, diffusion_term)`. +/// +/// # Panics +/// +/// Panics if `sigma_columns` is empty or any column's length does not match +/// `tape.num_inputs()`. +pub fn parabolic_diffusion( + tape: &BytecodeTape, + x: &[F], + sigma_columns: &[&[F]], +) -> (F, F) { + assert!(!sigma_columns.is_empty(), "sigma_columns must not be empty"); + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let half = F::from(0.5).unwrap(); + let two = F::from(2.0).unwrap(); + let mut buf = Vec::new(); + let mut value = F::zero(); + let mut sum = F::zero(); + + for col in sigma_columns { + assert_eq!( + col.len(), + n, + "sigma column length must match tape.num_inputs()" + ); + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, col, &mut buf); + value = c0; + sum = sum + two * c2; // (σ·e_i)^T H (σ·e_i) + } + + (value, half * sum) +} + +/// Stochastic version of [`parabolic_diffusion`]: subsample column indices. +/// +/// Estimate = `(d / |J|) · ½ · Σ_{i∈J} (σ·e_i)^T H (σ·e_i)`. +/// +/// # Panics +/// +/// Panics if `sampled_indices` is empty or any index is out of bounds. +pub fn parabolic_diffusion_stochastic( + tape: &BytecodeTape, + x: &[F], + sigma_columns: &[&[F]], + sampled_indices: &[usize], +) -> EstimatorResult { + assert!( + !sampled_indices.is_empty(), + "sampled_indices must not be empty" + ); + let n = tape.num_inputs(); + let d = sigma_columns.len(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let two = F::from(2.0).unwrap(); + let half = F::from(0.5).unwrap(); + let df = F::from(d).unwrap(); + + let mut buf = Vec::new(); + let mut value = F::zero(); + let mut acc = WelfordAccumulator::new(); + + for &i in sampled_indices { + assert!(i < d, "sampled index {} out of bounds (d={})", i, d); + let col = sigma_columns[i]; + assert_eq!( + col.len(), + n, + "sigma column length must match tape.num_inputs()" + ); + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, col, &mut buf); + value = c0; + acc.update(two * c2); // (σ·e_i)^T H (σ·e_i) + } + + let (mean, sample_variance, standard_error) = acc.finalize(); + + // Unbiased estimator for ½ Σ_i (σ·e_i)^T H (σ·e_i): d * mean * ½ + EstimatorResult { + value, + estimate: mean * df * half, + sample_variance, + standard_error, + num_samples: sampled_indices.len(), + } +} + +/// Dense STDE for a positive-definite 2nd-order operator. +/// +/// Given a Cholesky factor `L` (lower-triangular) such that `C = L L^T`, +/// and standard Gaussian vectors `z_s ~ N(0, I)`, this computes +/// `v_s = L · z_s` internally and estimates +/// `tr(C · H_u(x)) = Σ_{ij} C_{ij} ∂²u/∂x_i∂x_j`. +/// +/// The key insight: if `E[v v^T] = C`, then `E[v^T H v] = tr(C · H)`. +/// With `v = L · z` where `z ~ N(0, I)`, we get `E[vv^T] = L L^T = C`. +/// +/// The caller provides `z_vectors` (standard Gaussian samples) and +/// `cholesky_rows` (the rows of L — only lower-triangular entries matter). +/// +/// **Indefinite C deferred**: For indefinite operators, manually split into +/// `C⁺ - C⁻`, compute Cholesky factors for each, and call twice. +/// +/// **No `rand` dependency**: callers provide z_vectors. +/// +/// # Panics +/// +/// Panics if `z_vectors` is empty, `cholesky_rows.len()` does not match +/// `tape.num_inputs()`, or any vector has the wrong length. +pub fn dense_stde_2nd( + tape: &BytecodeTape, + x: &[F], + cholesky_rows: &[&[F]], + z_vectors: &[&[F]], +) -> EstimatorResult { + assert!(!z_vectors.is_empty(), "z_vectors must not be empty"); + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + assert_eq!( + cholesky_rows.len(), + n, + "cholesky_rows.len() must match tape.num_inputs()" + ); + + let two = F::from(2.0).unwrap(); + let mut buf = Vec::new(); + let mut v = vec![F::zero(); n]; + let mut value = F::zero(); + let mut acc = WelfordAccumulator::new(); + + for z in z_vectors.iter() { + assert_eq!(z.len(), n, "z_vector length must match tape.num_inputs()"); + + // Compute v = L · z (lower-triangular mat-vec) + for i in 0..n { + let row = cholesky_rows[i]; + let mut sum = F::zero(); + // Only use lower-triangular entries: j <= i + for j in 0..=i { + sum = sum + row[j] * z[j]; + } + v[i] = sum; + } + + let (c0, _, c2) = taylor_jet_2nd_with_buf(tape, x, &v, &mut buf); + value = c0; + // 2 * c2 = v^T H v, and E[v^T H v] = tr(C · H) when E[vv^T] = C + acc.update(two * c2); + } + + let (estimate, sample_variance, standard_error) = acc.finalize(); + + EstimatorResult { + value, + estimate, + sample_variance, + standard_error, + num_samples: z_vectors.len(), + } +} diff --git a/src/stde/pipeline.rs b/src/stde/pipeline.rs new file mode 100644 index 0000000..fadc528 --- /dev/null +++ b/src/stde/pipeline.rs @@ -0,0 +1,110 @@ +use super::estimator::Estimator; +use super::jet::taylor_jet_2nd_with_buf; +use super::types::{EstimatorResult, WelfordAccumulator}; +use crate::bytecode_tape::BytecodeTape; +use crate::Float; + +/// Estimate a quantity using the given [`Estimator`] and Welford's online algorithm. +/// +/// Evaluates the tape at `x` for each direction, computes the estimator's sample +/// from the Taylor jet, and aggregates with running mean and variance. +/// +/// # Panics +/// +/// Panics if `directions` is empty or any direction's length does not match +/// `tape.num_inputs()`. +pub fn estimate( + estimator: &impl Estimator, + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], +) -> EstimatorResult { + assert!(!directions.is_empty(), "directions must not be empty"); + + let mut buf = Vec::new(); + let mut value = F::zero(); + let mut acc = WelfordAccumulator::new(); + + for v in directions.iter() { + let (c0, c1, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); + value = c0; + acc.update(estimator.sample(c0, c1, c2)); + } + + let (estimate, sample_variance, standard_error) = acc.finalize(); + + EstimatorResult { + value, + estimate, + sample_variance, + standard_error, + num_samples: directions.len(), + } +} + +/// Estimate a quantity using importance-weighted samples (West's 1979 algorithm). +/// +/// Each direction `directions[s]` has an associated weight `weights[s]`. +/// The weighted mean is `Σ(w_s * sample_s) / Σ(w_s)` and the variance uses +/// the reliability-weight Bessel correction: `M2 / (W - W2/W)` where +/// `W = Σw_s` and `W2 = Σw_s²`. +/// +/// # Panics +/// +/// Panics if `directions` is empty, `weights.len() != directions.len()`, +/// or any direction's length does not match `tape.num_inputs()`. +pub fn estimate_weighted( + estimator: &impl Estimator, + tape: &BytecodeTape, + x: &[F], + directions: &[&[F]], + weights: &[F], +) -> EstimatorResult { + assert!(!directions.is_empty(), "directions must not be empty"); + assert_eq!( + weights.len(), + directions.len(), + "weights.len() must match directions.len()" + ); + + let mut buf = Vec::new(); + let mut value = F::zero(); + + // West's (1979) weighted online algorithm + let mut w_sum = F::zero(); + let mut w_sum2 = F::zero(); + let mut mean = F::zero(); + let mut m2 = F::zero(); + + for (k, v) in directions.iter().enumerate() { + let (c0, c1, c2) = taylor_jet_2nd_with_buf(tape, x, v, &mut buf); + value = c0; + let s = estimator.sample(c0, c1, c2); + let w = weights[k]; + + w_sum = w_sum + w; + w_sum2 = w_sum2 + w * w; + let delta = s - mean; + mean = mean + (w / w_sum) * delta; + let delta2 = s - mean; + m2 = m2 + w * delta * delta2; + } + + let n = directions.len(); + let denom = w_sum - w_sum2 / w_sum; + let (sample_variance, standard_error) = if n > 1 && denom > F::zero() { + let var = m2 / denom; + let nf = F::from(n).unwrap(); + (var, (var / nf).sqrt()) + } else { + (F::zero(), F::zero()) + }; + + EstimatorResult { + value, + estimate: mean, + sample_variance, + standard_error, + num_samples: n, + } +} diff --git a/src/stde/sparse.rs b/src/stde/sparse.rs new file mode 100644 index 0000000..bd83022 --- /dev/null +++ b/src/stde/sparse.rs @@ -0,0 +1,80 @@ +use super::types::{EstimatorResult, WelfordAccumulator}; +use crate::bytecode_tape::BytecodeTape; +use crate::diffop::SparseSamplingDistribution; +use crate::taylor_dyn::{TaylorArenaLocal, TaylorDyn, TaylorDynGuard}; +use crate::Float; + +/// Sparse STDE: estimate `Lu(x)` by sampling sparse k-jets. +/// +/// Each sample index corresponds to an entry in `dist`. For each sample, +/// one forward pushforward of a sparse k-jet is performed. +/// +/// The per-sample estimator is: `sign(C_α) · Z · prefactor · coeffs[k]` +/// where `Z = dist.normalization()` (paper Eq 13). +/// +/// Paper reference: Eq 13, Section 4.3. +/// +/// # Panics +/// +/// Panics if `sampled_indices` is empty or any index is out of bounds. +pub fn stde_sparse( + tape: &BytecodeTape, + x: &[F], + dist: &SparseSamplingDistribution, + sampled_indices: &[usize], +) -> EstimatorResult { + assert!( + !sampled_indices.is_empty(), + "sampled_indices must not be empty" + ); + let n = tape.num_inputs(); + assert_eq!(x.len(), n, "x.len() must match tape.num_inputs()"); + + let order = dist.jet_order() + 1; + let z = dist.normalization(); + let _guard = TaylorDynGuard::::new(order); + + let mut value = F::zero(); + let mut acc = WelfordAccumulator::new(); + let mut buf: Vec> = Vec::new(); + + for &idx in sampled_indices { + let entry = dist.entry(idx); + + // Build TaylorDyn inputs: coeffs[0] = x[i] for all, then set + // coeffs[slot] = 1/slot! for active variables per entry.input_coeffs + let inputs: Vec> = (0..n) + .map(|i| { + let mut coeffs = vec![F::zero(); order]; + coeffs[0] = x[i]; + for &(var, slot, inv_fact) in entry.input_coeffs() { + if var == i && slot < order { + coeffs[slot] = inv_fact; + } + } + TaylorDyn::from_coeffs(&coeffs) + }) + .collect(); + + tape.forward_tangent(&inputs, &mut buf); + + let out_coeffs = buf[tape.output_index()].coeffs(); + value = out_coeffs[0]; + + // Extract: raw = coeffs[output_coeff_index] * extraction_prefactor + let raw = out_coeffs[entry.output_coeff_index()] * entry.extraction_prefactor(); + // Sample = sign * Z * raw + let sample = entry.sign() * z * raw; + acc.update(sample); + } + + let (estimate, sample_variance, standard_error) = acc.finalize(); + + EstimatorResult { + value, + estimate, + sample_variance, + standard_error, + num_samples: sampled_indices.len(), + } +} diff --git a/src/stde/types.rs b/src/stde/types.rs new file mode 100644 index 0000000..3a180d6 --- /dev/null +++ b/src/stde/types.rs @@ -0,0 +1,75 @@ +use crate::Float; + +/// Result of a stochastic estimation with sample statistics. +/// +/// Contains the function value, the estimated operator value, and +/// sample statistics (variance, standard error) that quantify +/// estimator quality. +#[derive(Clone, Debug)] +pub struct EstimatorResult { + /// Function value f(x). + pub value: F, + /// Estimated operator value (e.g. Laplacian). + pub estimate: F, + /// Sample variance of the per-direction estimates. + /// Zero when `num_samples == 1` (undefined, clamped to zero). + pub sample_variance: F, + /// Standard error of the mean: `sqrt(sample_variance / num_samples)`. + pub standard_error: F, + /// Number of direction samples used. + pub num_samples: usize, +} + +/// Result of a divergence estimation. +/// +/// Separate from [`EstimatorResult`] because the function output is a vector +/// (`values: Vec`) rather than a scalar (`value: F`). +#[derive(Clone, Debug)] +pub struct DivergenceResult { + /// Function output vector f(x). + pub values: Vec, + /// Estimated divergence (trace of the Jacobian). + pub estimate: F, + /// Sample variance of per-direction estimates. + pub sample_variance: F, + /// Standard error of the mean. + pub standard_error: F, + /// Number of direction samples used. + pub num_samples: usize, +} + +/// Welford's online algorithm for incremental mean and variance. +pub(super) struct WelfordAccumulator { + mean: F, + m2: F, + count: usize, +} + +impl WelfordAccumulator { + pub(super) fn new() -> Self { + Self { + mean: F::zero(), + m2: F::zero(), + count: 0, + } + } + + pub(super) fn update(&mut self, sample: F) { + self.count += 1; + let k1 = F::from(self.count).unwrap(); + let delta = sample - self.mean; + self.mean = self.mean + delta / k1; + let delta2 = sample - self.mean; + self.m2 = self.m2 + delta * delta2; + } + + pub(super) fn finalize(&self) -> (F, F, F) { + let nf = F::from(self.count).unwrap(); + if self.count > 1 { + let var = self.m2 / (nf - F::one()); + (self.mean, var, (var / nf).sqrt()) + } else { + (self.mean, F::zero(), F::zero()) + } + } +} diff --git a/src/tape.rs b/src/tape.rs index c257e65..4f74281 100644 --- a/src/tape.rs +++ b/src/tape.rs @@ -39,6 +39,7 @@ impl Default for Tape { impl Tape { /// Create an empty tape. + #[must_use] pub fn new() -> Self { let mut tape = Tape { statements: Vec::new(), @@ -56,6 +57,7 @@ impl Tape { } /// Create a tape with pre-allocated capacity. + #[must_use] pub fn with_capacity(est_ops: usize) -> Self { let mut tape = Tape { statements: Vec::with_capacity(est_ops + 1), @@ -135,6 +137,7 @@ impl Tape { /// Run the reverse sweep, seeding the adjoint of `seed_index` with 1. /// Returns the full adjoint vector. + #[must_use] pub fn reverse(&self, seed_index: u32) -> Vec { let mut adjoints = vec![F::zero(); self.num_variables as usize]; adjoints[seed_index as usize] = F::one(); diff --git a/src/taylor_dyn.rs b/src/taylor_dyn.rs index 7931366..bfc827c 100644 --- a/src/taylor_dyn.rs +++ b/src/taylor_dyn.rs @@ -32,6 +32,7 @@ pub struct TaylorArena { impl TaylorArena { /// Create a new arena with the given degree. + #[must_use] pub fn new(degree: usize) -> Self { TaylorArena { data: Vec::new(), @@ -42,6 +43,7 @@ impl TaylorArena { /// Number of coefficients per entry. #[inline] + #[must_use] pub fn degree(&self) -> usize { self.degree } @@ -58,6 +60,7 @@ impl TaylorArena { /// Get the coefficient slice for entry `index`. #[inline] + #[must_use] pub fn coeffs(&self, index: u32) -> &[F] { let start = index as usize * self.degree; &self.data[start..start + self.degree] @@ -113,6 +116,10 @@ pub fn with_active_arena(f: impl FnOnce(&mut TaylorArena !ptr.is_null(), "No active Taylor arena. Create a TaylorDynGuard first." ); + // SAFETY: The pointer is non-null (asserted above) and was set by a + // `TaylorDynGuard` that owns the `Box>` and keeps it alive + // for the guard's lifetime. The thread-local cell ensures single-threaded + // access, so no aliasing occurs. let arena = unsafe { &mut *ptr }; f(arena) }) @@ -130,6 +137,7 @@ pub struct TaylorDynGuard { impl TaylorDynGuard { /// Create and activate a Taylor arena with the given `degree` /// (number of Taylor coefficients per variable). + #[must_use] pub fn new(degree: usize) -> Self { let mut arena = Box::new(TaylorArena::new(degree)); let prev = F::cell().with(|cell| { @@ -141,6 +149,7 @@ impl TaylorDynGuard { } /// Access the underlying arena. + #[must_use] pub fn arena(&self) -> &TaylorArena { &self.arena } diff --git a/src/traits/laurent_std_ops.rs b/src/traits/laurent_std_ops.rs index a237a55..bf837aa 100644 --- a/src/traits/laurent_std_ops.rs +++ b/src/traits/laurent_std_ops.rs @@ -75,7 +75,9 @@ impl Sub for Laurent { } } -// Laurent Mul delegates to taylor_ops::taylor_mul (Cauchy product) which involves addition +// Truncated Laurent series multiplication uses the Cauchy product, which accumulates +// terms via addition — clippy flags the + inside a Mul impl, but this is correct for +// power-series coefficient propagation. #[allow(clippy::suspicious_arithmetic_impl)] impl Mul for Laurent { type Output = Self; @@ -87,7 +89,9 @@ impl Mul for Laurent { } } -// Laurent Div delegates to taylor_ops::taylor_div which involves multiplication internally +// Truncated Laurent series division computes coefficients via recurrence that uses +// multiplication internally — clippy flags the * inside a Div impl, but this is correct +// for power-series coefficient propagation. #[allow(clippy::suspicious_arithmetic_impl)] impl Div for Laurent { type Output = Self; @@ -208,7 +212,9 @@ macro_rules! impl_laurent_scalar_ops { } } - // Scalar Div delegates to Laurent Div (self / Laurent::constant(rhs)) to reuse pole arithmetic + // Scalar division delegates to full Laurent Div to reuse pole-order arithmetic. + // Clippy flags the / inside a Div impl (self-referential dispatch), but this is + // intentional — the scalar is promoted to a Laurent series first. #[allow(clippy::suspicious_arithmetic_impl)] impl Div<$f> for Laurent<$f, K> { type Output = Laurent<$f, K>; diff --git a/src/traits/simba_impls.rs b/src/traits/simba_impls.rs index 60f2d36..1ccd7ad 100644 --- a/src/traits/simba_impls.rs +++ b/src/traits/simba_impls.rs @@ -30,6 +30,8 @@ impl SimdValue for Dual { *self } #[inline(always)] + // SAFETY: This is a single-lane (LANES=1) scalar type, so the lane index + // is always 0 and the operation is trivially safe regardless of input. unsafe fn extract_unchecked(&self, _: usize) -> Self::Element { *self } @@ -38,6 +40,8 @@ impl SimdValue for Dual { *self = val; } #[inline(always)] + // SAFETY: This is a single-lane (LANES=1) scalar type, so the lane index + // is always 0 and the operation is trivially safe regardless of input. unsafe fn replace_unchecked(&mut self, _: usize, val: Self::Element) { *self = val; } @@ -67,6 +71,8 @@ impl SimdValue for Reverse { *self } #[inline(always)] + // SAFETY: This is a single-lane (LANES=1) scalar type, so the lane index + // is always 0 and the operation is trivially safe regardless of input. unsafe fn extract_unchecked(&self, _: usize) -> Self::Element { *self } @@ -75,6 +81,8 @@ impl SimdValue for Reverse { *self = val; } #[inline(always)] + // SAFETY: This is a single-lane (LANES=1) scalar type, so the lane index + // is always 0 and the operation is trivially safe regardless of input. unsafe fn replace_unchecked(&mut self, _: usize, val: Self::Element) { *self = val; } diff --git a/src/traits/taylor_std_ops.rs b/src/traits/taylor_std_ops.rs index 35ddd9b..bcf2892 100644 --- a/src/traits/taylor_std_ops.rs +++ b/src/traits/taylor_std_ops.rs @@ -32,7 +32,9 @@ impl Sub for Taylor { } } -// Taylor Mul delegates to taylor_ops::taylor_mul (Cauchy product) which involves addition +// Truncated Taylor series multiplication uses the Cauchy product, which accumulates +// terms via addition — clippy flags the + inside a Mul impl, but this is correct for +// power-series coefficient propagation. #[allow(clippy::suspicious_arithmetic_impl)] impl Mul for Taylor { type Output = Self; @@ -44,7 +46,9 @@ impl Mul for Taylor { } } -// Taylor Div delegates to taylor_ops::taylor_div which involves multiplication internally +// Truncated Taylor series division computes coefficients via recurrence that uses +// multiplication internally — clippy flags the * inside a Div impl, but this is correct +// for power-series coefficient propagation. #[allow(clippy::suspicious_arithmetic_impl)] impl Div for Taylor { type Output = Self; @@ -186,7 +190,8 @@ macro_rules! impl_taylor_scalar_ops { } } - // Scalar Div uses Mul internally (multiply by reciprocal for efficiency) + // Scalar division multiplies by the reciprocal for efficiency — clippy flags + // the * inside a Div impl, but this is the standard reciprocal-multiply optimization. #[allow(clippy::suspicious_arithmetic_impl)] impl Div<$f> for Taylor<$f, K> { type Output = Taylor<$f, K>; @@ -268,7 +273,9 @@ impl Sub for TaylorDyn { } } -// TaylorDyn Mul delegates to taylor_ops::taylor_mul (Cauchy product) which involves addition +// Truncated Taylor series multiplication uses the Cauchy product, which accumulates +// terms via addition — clippy flags the + inside a Mul impl, but this is correct for +// power-series coefficient propagation. #[allow(clippy::suspicious_arithmetic_impl)] impl Mul for TaylorDyn { type Output = Self; @@ -278,7 +285,9 @@ impl Mul for TaylorDyn { } } -// TaylorDyn Div delegates to taylor_ops::taylor_div which involves multiplication internally +// Truncated Taylor series division computes coefficients via recurrence that uses +// multiplication internally — clippy flags the * inside a Div impl, but this is correct +// for power-series coefficient propagation. #[allow(clippy::suspicious_arithmetic_impl)] impl Div for TaylorDyn { type Output = Self; From 9cbf26827adaa73d524ea01300a7c14b4c763b9d Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Sat, 14 Mar 2026 16:47:13 +0000 Subject: [PATCH 2/2] Fix rustfmt formatting in stde/mod.rs re-export --- src/stde/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stde/mod.rs b/src/stde/mod.rs index 5147594..315dc40 100644 --- a/src/stde/mod.rs +++ b/src/stde/mod.rs @@ -107,9 +107,7 @@ pub use laplacian::{ hessian_diagonal, hessian_diagonal_with_buf, laplacian, laplacian_hutchpp, laplacian_with_control, laplacian_with_stats, }; -pub use pde::{ - dense_stde_2nd, divergence, parabolic_diffusion, parabolic_diffusion_stochastic, -}; +pub use pde::{dense_stde_2nd, divergence, parabolic_diffusion, parabolic_diffusion_stochastic}; pub use pipeline::{estimate, estimate_weighted}; pub use types::{DivergenceResult, EstimatorResult};