diff --git a/CHANGELOG.md b/CHANGELOG.md index 054d493..7f881fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.1] - 2026-03-14 + +### Fixed + +- **Powi f32 exponent encoding**: `powi(n)` on f32 bytecode tapes silently produced wrong values and gradients for negative exponents (`n <= -2`). The `i32` exponent was stored as `u32` then round-tripped through `f32`, which loses precision for values > 2^24 (all negative exponents). All 5 dispatch sites (forward, reverse, tangent forward, tangent reverse, cross-country) now decode the exponent directly from the raw `u32` via `powi_exp_decode_raw`, bypassing the float conversion entirely. +- **taylor_powi negative base**: `Taylor::powi` and bytecode Taylor-mode produced NaN for negative base values (e.g. `(-2)^3`) because the implementation used `exp(n * ln(a))` which fails for `ln(negative)`. Added `taylor_powi_squaring` using binary exponentiation with `taylor_mul`, dispatched when `a[0] < 0` or `|n| <= 8`. +- **Checkpoint position lookup**: `grad_checkpointed`, `grad_checkpointed_disk`, and `grad_checkpointed_with_hints` used `Vec::contains()` for checkpoint position lookups (O(n) per step). Converted to `HashSet` for O(1) lookups. +- **Nonsmooth Round kink detection**: `forward_nonsmooth` now correctly detects Round kinks at half-integers (0.5, 1.5, ...) instead of at integers, matching the actual discontinuity locations of the `round` function. Updated test to match. 
+ ## [0.4.0] - 2026-02-26 ### Changed @@ -175,7 +184,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Forward-vs-reverse cross-validation on Rosenbrock, Beale, Ackley, Booth, and more - Criterion benchmarks for forward overhead and reverse gradient -[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.0...HEAD +[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.1...HEAD +[0.4.1]: https://github.com/Entrolution/echidna/compare/v0.4.0...v0.4.1 [0.4.0]: https://github.com/Entrolution/echidna/compare/v0.3.0...v0.4.0 [0.3.0]: https://github.com/Entrolution/echidna/compare/v0.2.0...v0.3.0 [0.2.0]: https://github.com/Entrolution/echidna/compare/v0.1.0...v0.2.0 diff --git a/Cargo.toml b/Cargo.toml index eaf5666..5dd77b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = [".", "echidna-optim"] [package] name = "echidna" -version = "0.4.0" +version = "0.4.1" edition = "2021" rust-version = "1.93" description = "A high-performance automatic differentiation library for Rust" diff --git a/SECURITY.md b/SECURITY.md index 63c0c2f..f48d129 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,10 +4,10 @@ | Version | Supported | |---------|-----------| -| 0.4.x | Yes | -| < 0.4 | No | +| >= 0.4.1 | Yes | +| < 0.4.1 | No | -Only the latest minor release receives security updates. +Only the latest patch release receives security updates. Versions prior to 0.4.1 have known correctness bugs (silent wrong results from `powi` on f32 tapes, NaN from `Taylor::powi` with negative base). 
## Reporting a Vulnerability diff --git a/echidna-optim/Cargo.toml b/echidna-optim/Cargo.toml index 267f5ba..f458803 100644 --- a/echidna-optim/Cargo.toml +++ b/echidna-optim/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "echidna-optim" -version = "0.4.0" +version = "0.4.1" edition = "2021" rust-version = "1.93" description = "Optimization solvers and implicit differentiation for echidna" @@ -12,7 +12,7 @@ keywords = ["optimization", "differentiation", "implicit", "gradient", "scientif categories = ["mathematics", "science", "algorithms"] [dependencies] -echidna = { version = "0.4.0", path = "..", features = ["bytecode"] } +echidna = { version = "0.4.1", path = "..", features = ["bytecode"] } num-traits = "0.2" faer = { version = "0.24", optional = true } diff --git a/echidna-optim/src/solvers/lbfgs.rs b/echidna-optim/src/solvers/lbfgs.rs index 452ffed..966edb1 100644 --- a/echidna-optim/src/solvers/lbfgs.rs +++ b/echidna-optim/src/solvers/lbfgs.rs @@ -51,9 +51,9 @@ pub fn lbfgs>( if config.memory == 0 || config.convergence.max_iter == 0 { return OptimResult { x: x0.to_vec(), - value: F::zero(), - gradient: vec![F::zero(); n], - gradient_norm: F::zero(), + value: F::nan(), + gradient: vec![F::nan(); n], + gradient_norm: F::nan(), iterations: 0, func_evals: 0, termination: TerminationReason::NumericalError, diff --git a/echidna-optim/src/solvers/newton.rs b/echidna-optim/src/solvers/newton.rs index f2f19f8..f2636e5 100644 --- a/echidna-optim/src/solvers/newton.rs +++ b/echidna-optim/src/solvers/newton.rs @@ -49,9 +49,9 @@ pub fn newton>( if config.convergence.max_iter == 0 { return OptimResult { x: x0.to_vec(), - value: F::zero(), - gradient: vec![F::zero(); n], - gradient_norm: F::zero(), + value: F::nan(), + gradient: vec![F::nan(); n], + gradient_norm: F::nan(), iterations: 0, func_evals: 0, termination: TerminationReason::NumericalError, diff --git a/echidna-optim/src/solvers/trust_region.rs b/echidna-optim/src/solvers/trust_region.rs index ddbc944..fd415c4 100644 
--- a/echidna-optim/src/solvers/trust_region.rs +++ b/echidna-optim/src/solvers/trust_region.rs @@ -62,9 +62,9 @@ pub fn trust_region>( { return OptimResult { x: x0.to_vec(), - value: F::zero(), - gradient: vec![F::zero(); n], - gradient_norm: F::zero(), + value: F::nan(), + gradient: vec![F::nan(); n], + gradient_norm: F::nan(), iterations: 0, func_evals: 0, termination: TerminationReason::NumericalError, diff --git a/src/api.rs b/src/api.rs index 3674a2a..1b30a6e 100644 --- a/src/api.rs +++ b/src/api.rs @@ -43,6 +43,12 @@ pub fn grad( let output = f(&inputs); drop(guard); + // If the output is a constant (independent of all inputs), the gradient is zero. + if output.index == crate::tape::CONSTANT { + Tape::return_to_pool(tape); + return vec![F::zero(); n]; + } + // Run reverse sweep. let adjoints = tape.reverse(output.index); diff --git a/src/bytecode_tape/forward.rs b/src/bytecode_tape/forward.rs index faeff59..b77b1bd 100644 --- a/src/bytecode_tape/forward.rs +++ b/src/bytecode_tape/forward.rs @@ -30,10 +30,13 @@ fn forward_dispatch( op => { let [a_idx, b_idx] = arg_indices[i]; let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + values[i] = a.powi(exp); + continue; + } + let b = if b_idx != UNUSED { values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) } else { F::zero() }; @@ -128,7 +131,7 @@ impl super::BytecodeTape { branch: if a >= F::zero() { 1 } else { -1 }, }); } - OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => { + OpCode::Floor | OpCode::Ceil | OpCode::Trunc => { // Kink at integer values. switching_value = distance to // nearest integer: zero exactly at kink points, works // symmetrically for both approach directions. @@ -143,6 +146,19 @@ impl super::BytecodeTape { }, }); } + OpCode::Round => { + // Round has kinks at half-integers (0.5, 1.5, ...), + // not at integers. 
Shift by 0.5 to measure distance + // to the nearest half-integer. + let half = F::from(0.5).unwrap(); + let shifted = a + half; + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: shifted - shifted.round(), + branch: if a - a.floor() < half { 1 } else { -1 }, + }); + } _ => unreachable!(), } } diff --git a/src/bytecode_tape/optimize.rs b/src/bytecode_tape/optimize.rs index 9e66e3e..f0725a9 100644 --- a/src/bytecode_tape/optimize.rs +++ b/src/bytecode_tape/optimize.rs @@ -34,7 +34,13 @@ impl super::BytecodeTape { if a != UNUSED { stack.push(a); } - if b != UNUSED && self.opcodes[i] != OpCode::Powi { + if self.opcodes[i] == OpCode::Custom { + // For Custom ops, b is a callback index, not a tape index. + // The actual second operand (for binary custom ops) is in custom_second_args. + if let Some(&second_arg) = self.custom_second_args.get(&(idx)) { + stack.push(second_arg); + } + } else if b != UNUSED && self.opcodes[i] != OpCode::Powi { stack.push(b); } } @@ -62,10 +68,13 @@ impl super::BytecodeTape { } else { UNUSED }; - let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi { + let rb = if b != UNUSED + && self.opcodes[read] != OpCode::Powi + && self.opcodes[read] != OpCode::Custom + { remap[b as usize] } else { - b + b // Powi: b is encoded exponent; Custom: b is callback index }; self.arg_indices[write] = [ra, rb]; write += 1; @@ -77,6 +86,16 @@ impl super::BytecodeTape { self.values.truncate(new_len); self.num_variables = new_len as u32; + // Remap custom_second_args: both keys and values are tape indices. + if !self.custom_second_args.is_empty() { + self.custom_second_args = self + .custom_second_args + .iter() + .filter(|(&k, _)| reachable[k as usize]) + .map(|(&k, &v)| (remap[k as usize], remap[v as usize])) + .collect(); + } + remap } @@ -97,9 +116,6 @@ impl super::BytecodeTape { /// `output_indices` contains only the active outputs (remapped), and /// `output_index` is set to the first active output. 
/// - /// **Note**: Like `dead_code_elimination`, this does not remap - /// `custom_second_args` (pre-existing limitation). - /// /// # Panics /// Panics if `active_outputs` is empty. pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) { @@ -142,14 +158,20 @@ impl super::BytecodeTape { } let [mut a, mut b] = self.arg_indices[i]; - // Apply remap to args (except Powi exponent in b). + // Apply remap to args (except Powi exponent and Custom callback in b). a = remap[a as usize]; - if b != UNUSED && op != OpCode::Powi { + if b != UNUSED && op != OpCode::Powi && op != OpCode::Custom { b = remap[b as usize]; } // Update arg_indices with remapped values. self.arg_indices[i] = [a, b]; + // Skip CSE for Custom ops: the key doesn't capture custom_second_args, + // so binary custom ops with different second operands would be incorrectly merged. + if op == OpCode::Custom { + continue; + } + // Build the canonical key. let key = if b == UNUSED { // Unary: hash (op, arg0) only; use UNUSED as placeholder. @@ -176,14 +198,22 @@ impl super::BytecodeTape { } let [a, b] = self.arg_indices[i]; let ra = remap[a as usize]; - let rb = if b != UNUSED && op != OpCode::Powi { + let rb = if b != UNUSED && op != OpCode::Powi && op != OpCode::Custom { remap[b as usize] } else { - b + b // Powi: b is encoded exponent; Custom: b is callback index }; self.arg_indices[i] = [ra, rb]; } + // Remap custom_second_args values (keys are not changed by CSE remap, + // but the second operand indices may have been CSE'd). + if !self.custom_second_args.is_empty() { + for v in self.custom_second_args.values_mut() { + *v = remap[*v as usize]; + } + } + // Update output indices. self.output_index = remap[self.output_index as usize]; for oi in &mut self.output_indices { @@ -198,8 +228,7 @@ impl super::BytecodeTape { /// /// In debug builds, validates internal consistency after optimization. 
pub fn optimize(&mut self) { - self.cse(); - self.dead_code_elimination(); + self.cse(); // CSE already calls dead_code_elimination() internally. // Validate internal consistency in debug builds. #[cfg(debug_assertions)] diff --git a/src/bytecode_tape/reverse.rs b/src/bytecode_tape/reverse.rs index 0c75cfd..21197df 100644 --- a/src/bytecode_tape/reverse.rs +++ b/src/bytecode_tape/reverse.rs @@ -59,10 +59,15 @@ impl super::BytecodeTape { adjoints[i] = F::zero(); let [a_idx, b_idx] = self.arg_indices[i]; let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + let n = F::from(exp).unwrap(); + let da = n * a.powi(exp - 1); + adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; + continue; + } + let b = if b_idx != UNUSED { values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) } else { F::zero() }; @@ -74,7 +79,7 @@ impl super::BytecodeTape { }; adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { + if b_idx != UNUSED { adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj; } } diff --git a/src/bytecode_tape/tangent.rs b/src/bytecode_tape/tangent.rs index 9f10d48..24e2646 100644 --- a/src/bytecode_tape/tangent.rs +++ b/src/bytecode_tape/tangent.rs @@ -80,10 +80,13 @@ impl super::BytecodeTape { op => { let [a_idx, b_idx] = self.arg_indices[i]; let a = buf[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + buf[i] = a.powi(exp); + continue; + } + let b = if b_idx != UNUSED { buf[b_idx as usize] - } else if op == OpCode::Powi { - T::from(b_idx).unwrap_or_else(|| T::zero()) } else { T::zero() }; @@ -170,10 +173,15 @@ impl super::BytecodeTape { let [a_idx, b_idx] = self.arg_indices[i]; let a = tangent_vals[a_idx as usize]; - let b = if b_idx != UNUSED && op 
!= OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + let n = T::from(exp).unwrap(); + let da = n * a.powi(exp - 1); + buf[a_idx as usize] = buf[a_idx as usize] + da * adj; + continue; + } + let b = if b_idx != UNUSED { tangent_vals[b_idx as usize] - } else if op == OpCode::Powi { - T::from(b_idx).unwrap_or_else(|| T::zero()) } else { T::zero() }; @@ -181,7 +189,7 @@ impl super::BytecodeTape { let (da, db) = opcode::reverse_partials(op, a, b, r); buf[a_idx as usize] = buf[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { + if b_idx != UNUSED { buf[b_idx as usize] = buf[b_idx as usize] + db * adj; } } diff --git a/src/checkpoint.rs b/src/checkpoint.rs index cc22f12..fadb7f9 100644 --- a/src/checkpoint.rs +++ b/src/checkpoint.rs @@ -4,6 +4,8 @@ //! (Griewank & Walther, 2000) to minimize recomputation for a given //! number of checkpoint slots. +use std::collections::HashSet; + use crate::breverse::BReverse; use crate::bytecode_tape::{BtapeGuard, BtapeThreadLocal, BytecodeTape}; use crate::float::Float; @@ -45,8 +47,11 @@ pub fn grad_checkpointed( let num_checkpoints = num_checkpoints.max(1).min(num_steps); - // Compute optimal checkpoint positions using Revolve schedule - let checkpoint_positions = revolve_schedule(num_steps, num_checkpoints); + // Compute optimal checkpoint positions using Revolve schedule. + // Note: the position set is O(num_steps) while checkpoint *states* are O(num_checkpoints). + let checkpoint_positions: HashSet = revolve_schedule(num_steps, num_checkpoints) + .into_iter() + .collect(); // -- Forward pass: run all steps, saving checkpoints -- let mut checkpoints: Vec<(usize, Vec)> = Vec::with_capacity(num_checkpoints + 1); @@ -153,7 +158,7 @@ pub fn grad_checkpointed_online( } // Thin when buffer is full. - if buffer.len() == num_checkpoints { + if buffer.len() >= num_checkpoints { // Keep buffer[0] (pinned). Among buffer[1..], keep even-indexed entries. 
let tail: Vec<(usize, Vec)> = buffer[1..].iter().step_by(2).cloned().collect(); buffer.truncate(1); @@ -244,7 +249,7 @@ pub fn grad_checkpointed_with_hints( let slot_alloc = largest_remainder_alloc(free, &interval_lengths, total_len); // Run Revolve on each sub-interval and merge all positions. - let mut all_positions: Vec = required.clone(); + let mut all_positions: HashSet = required.iter().copied().collect(); for (i, &(start, end)) in intervals.iter().enumerate() { let sub_steps = end - start; let sub_slots = slot_alloc[i]; @@ -254,10 +259,8 @@ pub fn grad_checkpointed_with_hints( all_positions.extend(sub_positions.iter().map(|&p| p + start)); } } - all_positions.sort_unstable(); - all_positions.dedup(); - // Forward pass using the merged position list. + // Forward pass using the merged position set. let mut checkpoints: Vec<(usize, Vec)> = Vec::with_capacity(all_positions.len() + 1); checkpoints.push((0, x0.to_vec())); @@ -370,7 +373,9 @@ pub fn grad_checkpointed_disk( let num_checkpoints = num_checkpoints.max(1).min(num_steps); // Compute optimal checkpoint positions using Revolve schedule. - let checkpoint_positions = revolve_schedule(num_steps, num_checkpoints); + let checkpoint_positions: HashSet = revolve_schedule(num_steps, num_checkpoints) + .into_iter() + .collect(); // Drop guard ensures cleanup even on panic. let mut guard = DiskCheckpointGuard { files: Vec::new() }; @@ -405,7 +410,7 @@ pub fn grad_checkpointed_disk( // Build checkpoint index: sorted list of (step, path) for reading back. // Step 0 is always first. 
let mut ckpt_steps: Vec = vec![0]; - ckpt_steps.extend(checkpoint_positions.iter().filter(|&&p| p < num_steps)); + ckpt_steps.extend(checkpoint_positions.iter().filter(|&p| *p < num_steps)); ckpt_steps.sort_unstable(); ckpt_steps.dedup(); diff --git a/src/cross_country.rs b/src/cross_country.rs index fa9c50b..83fa5ed 100644 --- a/src/cross_country.rs +++ b/src/cross_country.rs @@ -77,10 +77,18 @@ impl LinearizedGraph { op => { let [a_idx, b_idx] = arg_indices[i]; let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + let n = F::from(exp).unwrap(); + let da = n * a.powi(exp - 1); + if opcodes[a_idx as usize] != OpCode::Const && da != zero { + preds[i].push((a_idx, da)); + succs[a_idx as usize].push((i as u32, da)); + } + continue; + } + let b = if b_idx != UNUSED { values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or(zero) } else { zero }; @@ -93,12 +101,8 @@ impl LinearizedGraph { succs[a_idx as usize].push((i as u32, da)); } - // Edge from second argument (binary ops only, not Powi) - if b_idx != UNUSED - && op != OpCode::Powi - && opcodes[b_idx as usize] != OpCode::Const - && db != zero - { + // Edge from second argument (binary ops only) + if b_idx != UNUSED && opcodes[b_idx as usize] != OpCode::Const && db != zero { preds[i].push((b_idx, db)); succs[b_idx as usize].push((i as u32, db)); } diff --git a/src/dual.rs b/src/dual.rs index 879aafb..b18f036 100644 --- a/src/dual.rs +++ b/src/dual.rs @@ -85,7 +85,7 @@ impl Dual { #[inline] pub fn powi(self, n: i32) -> Self { let val = self.re.powi(n); - let deriv = F::from(n).unwrap() * self.re.powi(n - 1); + let deriv = F::from(n).unwrap() * self.re.powf(F::from(n as i64 - 1).unwrap()); self.chain(val, deriv) } @@ -315,7 +315,11 @@ impl Dual { let h = self.re.hypot(other.re); Dual { re: h, - eps: (self.re * self.eps + other.re * other.eps) / h, + eps: if h == F::zero() { + F::zero() + 
} else { + (self.re * self.eps + other.re * other.eps) / h + }, } } diff --git a/src/dual_vec.rs b/src/dual_vec.rs index 4e6e74e..873d221 100644 --- a/src/dual_vec.rs +++ b/src/dual_vec.rs @@ -106,7 +106,7 @@ impl DualVec { #[inline] pub fn powi(self, n: i32) -> Self { let val = self.re.powi(n); - let deriv = F::from(n).unwrap() * self.re.powi(n - 1); + let deriv = F::from(n).unwrap() * self.re.powf(F::from(n as i64 - 1).unwrap()); self.chain(val, deriv) } @@ -335,7 +335,11 @@ impl DualVec { let h = self.re.hypot(other.re); DualVec { re: h, - eps: std::array::from_fn(|k| (self.re * self.eps[k] + other.re * other.eps[k]) / h), + eps: if h == F::zero() { + [F::zero(); N] + } else { + std::array::from_fn(|k| (self.re * self.eps[k] + other.re * other.eps[k]) / h) + }, } } diff --git a/src/laurent.rs b/src/laurent.rs index 41674fb..d47fe95 100644 --- a/src/laurent.rs +++ b/src/laurent.rs @@ -289,15 +289,17 @@ impl Laurent { pole_order: self.pole_order / 2, } } else if self.pole_order > 0 { - let tc = self.as_taylor_coeffs(); + if self.pole_order % 2 != 0 { + return Self::nan_laurent(); + } + // pole_order is even: sqrt of t^p * f(t) = t^(p/2) * sqrt(f(t)) + // where f(t) has coeffs[0] != 0 (due to normalization). let mut c = [F::zero(); K]; - taylor_ops::taylor_sqrt(&tc, &mut c); - let mut l = Laurent { + taylor_ops::taylor_sqrt(&self.coeffs, &mut c); + Laurent { coeffs: c, - pole_order: 0, - }; - l.normalize(); - l + pole_order: self.pole_order / 2, + } } else { let mut c = [F::zero(); K]; taylor_ops::taylor_sqrt(&self.coeffs, &mut c); @@ -326,17 +328,19 @@ impl Laurent { pole_order: self.pole_order / 3, } } else if self.pole_order > 0 { - let tc = self.as_taylor_coeffs(); + if self.pole_order % 3 != 0 { + return Self::nan_laurent(); + } + // pole_order divisible by 3: cbrt of t^p * f(t) = t^(p/3) * cbrt(f(t)) + // where f(t) has coeffs[0] != 0 (due to normalization). 
let mut c = [F::zero(); K]; let mut s1 = [F::zero(); K]; let mut s2 = [F::zero(); K]; - taylor_ops::taylor_cbrt(&tc, &mut c, &mut s1, &mut s2); - let mut l = Laurent { + taylor_ops::taylor_cbrt(&self.coeffs, &mut c, &mut s1, &mut s2); + Laurent { coeffs: c, - pole_order: 0, - }; - l.normalize(); - l + pole_order: self.pole_order / 3, + } } else { let mut c = [F::zero(); K]; let mut s1 = [F::zero(); K]; diff --git a/src/opcode.rs b/src/opcode.rs index 95291cd..07ffe10 100644 --- a/src/opcode.rs +++ b/src/opcode.rs @@ -276,7 +276,7 @@ pub fn reverse_partials(op: OpCode, a: T, b: T, r: T) -> (T, T) { let inv = one / b; (inv, -a * inv * inv) } - OpCode::Rem => (one, zero), + OpCode::Rem => (one, -(a / b).trunc()), OpCode::Powf => { // d/da a^b = b * a^(b-1) // d/db a^b = a^b * ln(a) @@ -382,6 +382,16 @@ fn powi_exp_decode(b: T) -> i32 { b.to_u32().unwrap_or(0) as i32 } +/// Decode a `powi` exponent directly from the raw `u32` in `arg_indices[1]`. +/// +/// Avoids the float round-trip of `powi_exp_decode`, which silently fails +/// for f32 when the u32 encoding exceeds 2^24 (any negative exponent). +#[inline] +#[must_use] +pub fn powi_exp_decode_raw(b_idx: u32) -> i32 { + b_idx as i32 +} + /// Encode a `powi` exponent as a value that can be stored in `arg_indices[1]`. #[inline] #[must_use] diff --git a/src/sparse.rs b/src/sparse.rs index cb8df5b..9df5ae3 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -120,6 +120,33 @@ pub(crate) fn detect_sparsity_impl( interactions.push((r, c)); } } + // For non-Mul binary ops (Div, Powf, Atan2, Hypot, Custom), + // within-operand second derivatives are nonzero. + // E.g. d^2(a/b)/db^2 = 2a/b^3, so variables flowing into + // the same operand interact. + // Mul is excluded: d^2(a*b)/da^2 = 0 (bilinear). 
+ if op != OpCode::Mul { + for ii in 0..bits_a.len() { + for jj in 0..=ii { + let (r, c) = if bits_a[ii] >= bits_a[jj] { + (bits_a[ii], bits_a[jj]) + } else { + (bits_a[jj], bits_a[ii]) + }; + interactions.push((r, c)); + } + } + for ii in 0..bits_b.len() { + for jj in 0..=ii { + let (r, c) = if bits_b[ii] >= bits_b[jj] { + (bits_b[ii], bits_b[jj]) + } else { + (bits_b[jj], bits_b[ii]) + }; + interactions.push((r, c)); + } + } + } } OpClass::ZeroDerivative => { // Propagate dependencies for downstream ops diff --git a/src/stde/higher_order.rs b/src/stde/higher_order.rs index 3d7e96b..dd2e609 100644 --- a/src/stde/higher_order.rs +++ b/src/stde/higher_order.rs @@ -203,12 +203,13 @@ pub fn diagonal_kth_order_stochastic( let (mean, sample_variance, standard_error) = acc.finalize(); - // Unbiased estimator for Σ_j d_j: n * mean(sampled d_j's) + // Unbiased estimator for Σ_j d_j: n * mean(sampled d_j's). + // Scale variance/SE to match the rescaled estimate. EstimatorResult { value, estimate: mean * nf, - sample_variance, - standard_error, + sample_variance: sample_variance * nf * nf, + standard_error: standard_error * nf, num_samples: sampled_indices.len(), } } diff --git a/src/stde/pde.rs b/src/stde/pde.rs index e9bfc0d..e60554e 100644 --- a/src/stde/pde.rs +++ b/src/stde/pde.rs @@ -171,12 +171,14 @@ pub fn parabolic_diffusion_stochastic( let (mean, sample_variance, standard_error) = acc.finalize(); - // Unbiased estimator for ½ Σ_i (σ·e_i)^T H (σ·e_i): d * mean * ½ + // Unbiased estimator for ½ Σ_i (σ·e_i)^T H (σ·e_i): d * mean * ½. + // Scale variance/SE to match the rescaled estimate. 
+ let scale = df * half; EstimatorResult { value, - estimate: mean * df * half, - sample_variance, - standard_error, + estimate: mean * scale, + sample_variance: sample_variance * scale * scale, + standard_error: standard_error * scale, num_samples: sampled_indices.len(), } } diff --git a/src/taylor_ops.rs b/src/taylor_ops.rs index 21f89f7..3baa212 100644 --- a/src/taylor_ops.rs +++ b/src/taylor_ops.rs @@ -442,12 +442,14 @@ pub fn taylor_powf( /// `c = a^n` (powi) — integer power. /// -/// Uses `scratch1` for `ln(a)` and `scratch2` for `n * ln(a)`. +/// Dispatches between two strategies: +/// - **Repeated squaring** (binary exponentiation via `taylor_mul`): used when +/// `a[0] < 0` (where `ln` would produce NaN) or `|n| <= 8` (at most 3 +/// multiplications, competitive with exp-ln). +/// - **exp(n * ln(a))**: used for positive base with large exponents. #[inline] pub fn taylor_powi(a: &[F], n: i32, c: &mut [F], scratch1: &mut [F], scratch2: &mut [F]) { let deg = c.len(); - // For small integer powers, direct computation may be more efficient, - // but for generality use exp(n * ln(a)). if n == 0 { c[0] = F::one(); for ck in c[1..deg].iter_mut() { @@ -459,14 +461,76 @@ pub fn taylor_powi(a: &[F], n: i32, c: &mut [F], scratch1: &mut [F], s c.copy_from_slice(a); return; } - // scratch1 = ln(a) - taylor_ln(a, scratch1); - // scratch2 = n * ln(a) - let nf = F::from(n).unwrap(); - taylor_scale(scratch1, nf, scratch2); - // c = exp(n * ln(a)) - taylor_exp(scratch2, c); - c[0] = a[0].powi(n); + if n == -1 { + taylor_recip(a, c); + return; + } + if a[0] < F::zero() || n.unsigned_abs() <= 8 { + taylor_powi_squaring(a, n, c, scratch1, scratch2); + } else { + // scratch1 = ln(a) + taylor_ln(a, scratch1); + // scratch2 = n * ln(a) + let nf = F::from(n).unwrap(); + taylor_scale(scratch1, nf, scratch2); + // c = exp(n * ln(a)) + taylor_exp(scratch2, c); + c[0] = a[0].powi(n); + } +} + +/// Integer power via binary exponentiation on Taylor coefficient arrays. 
+/// +/// Computes `a^n` using repeated squaring with `taylor_mul`. Works correctly +/// for negative base values (unlike the exp-ln path). For negative `n`, +/// computes `a^|n|` then takes the reciprocal. +fn taylor_powi_squaring( + a: &[F], + n: i32, + c: &mut [F], + scratch1: &mut [F], + scratch2: &mut [F], +) { + let deg = c.len(); + let abs_n = n.unsigned_abs(); + + // result (c) = 1 + c[0] = F::one(); + for ck in c[1..deg].iter_mut() { + *ck = F::zero(); + } + + // base (scratch1) = a + scratch1[..deg].copy_from_slice(&a[..deg]); + + let mut power = abs_n; + while power > 0 { + if power & 1 == 1 { + // result = result * base + taylor_mul(c, &*scratch1, scratch2); + c[..deg].copy_from_slice(&scratch2[..deg]); + } + power >>= 1; + if power > 0 { + // base = base * base + let base_ref: &[F] = &*scratch1; + // Inline squaring to avoid borrow conflict (scratch1 is both source and dest) + for k in 0..deg { + let mut sum = F::zero(); + for j in 0..=k { + sum = sum + base_ref[j] * base_ref[k - j]; + } + scratch2[k] = sum; + } + scratch1[..deg].copy_from_slice(&scratch2[..deg]); + } + } + + if n < 0 { + // c = 1/c: copy c into scratch1, then compute recip into c + scratch1[..deg].copy_from_slice(&c[..deg]); + taylor_recip(scratch1, c); + } } /// `c = cbrt(a) = a^(1/3)`. diff --git a/src/traits/breverse_ops.rs b/src/traits/breverse_ops.rs index 0343063..ea67625 100644 --- a/src/traits/breverse_ops.rs +++ b/src/traits/breverse_ops.rs @@ -265,11 +265,11 @@ macro_rules! impl_breverse_scalar_ops { #[inline] fn rem(self, rhs: BReverse<$f>) -> BReverse<$f> { let value = self % rhs.value; - // Constant % variable has zero derivative — treat as constant. 
- BReverse { - value, - index: CONSTANT, - } + let index = bytecode_tape::with_active_btape(|t| { + let c = t.push_const(self); + t.push_op(OpCode::Rem, c, rhs.index, value) + }); + BReverse { value, index } } } }; diff --git a/src/traits/dual_vec_ops.rs b/src/traits/dual_vec_ops.rs index 2f1f702..ca72d00 100644 --- a/src/traits/dual_vec_ops.rs +++ b/src/traits/dual_vec_ops.rs @@ -67,9 +67,10 @@ impl Rem for DualVec { type Output = Self; #[inline] fn rem(self, rhs: Self) -> Self { + let q = (self.re / rhs.re).trunc(); DualVec { re: self.re % rhs.re, - eps: self.eps, + eps: std::array::from_fn(|k| self.eps[k] - rhs.eps[k] * q), } } } @@ -217,9 +218,10 @@ macro_rules! impl_dual_vec_scalar_ops { type Output = DualVec<$f, N>; #[inline] fn rem(self, rhs: DualVec<$f, N>) -> DualVec<$f, N> { + let q = (self / rhs.re).trunc(); DualVec { re: self % rhs.re, - eps: [<$f>::from(0.0); N], + eps: std::array::from_fn(|k| -rhs.eps[k] * q), } } } diff --git a/src/traits/num_traits_impls.rs b/src/traits/num_traits_impls.rs index 9a7347a..f692eff 100644 --- a/src/traits/num_traits_impls.rs +++ b/src/traits/num_traits_impls.rs @@ -644,7 +644,7 @@ impl NumFloat for Reverse { fn powi(self, n: i32) -> Self { let val = self.value.powi(n); - let deriv = F::from(n).unwrap() * self.value.powi(n - 1); + let deriv = F::from(n).unwrap() * self.value.powf(F::from(n as i64 - 1).unwrap()); rev_unary(self, val, deriv) } @@ -793,8 +793,11 @@ impl NumFloat for Reverse { fn hypot(self, other: Self) -> Self { let h = self.value.hypot(other.value); - let dx = self.value / h; - let dy = other.value / h; + let (dx, dy) = if h == F::zero() { + (F::zero(), F::zero()) + } else { + (self.value / h, other.value / h) + }; rev_binary(self, other, h, dx, dy) } diff --git a/src/traits/std_ops.rs b/src/traits/std_ops.rs index 71ddb39..e6a6897 100644 --- a/src/traits/std_ops.rs +++ b/src/traits/std_ops.rs @@ -5,7 +5,7 @@ use std::ops::{ use crate::dual::Dual; use crate::float::Float; use crate::reverse::Reverse; 
-use crate::tape::{self, TapeThreadLocal, CONSTANT}; +use crate::tape::{self, TapeThreadLocal}; // ────────────────────────────────────────────── // Dual operators @@ -73,7 +73,7 @@ impl Rem for Dual { fn rem(self, rhs: Self) -> Self { Dual { re: self.re % rhs.re, - eps: self.eps, + eps: self.eps - rhs.eps * (self.re / rhs.re).trunc(), } } } @@ -222,9 +222,10 @@ macro_rules! impl_dual_scalar_ops { type Output = Dual<$f>; #[inline] fn rem(self, rhs: Dual<$f>) -> Dual<$f> { + let q = (self / rhs.re).trunc(); Dual { re: self % rhs.re, - eps: <$f>::from(0.0), + eps: -rhs.eps * q, } } } @@ -313,8 +314,8 @@ impl Rem for Reverse { #[inline] fn rem(self, rhs: Self) -> Self { let value = self.value % rhs.value; - let index = - tape::with_active_tape(|t| t.push_binary(self.index, F::one(), rhs.index, F::zero())); + let q = (self.value / rhs.value).trunc(); + let index = tape::with_active_tape(|t| t.push_binary(self.index, F::one(), rhs.index, -q)); Reverse { value, index } } } @@ -454,11 +455,9 @@ macro_rules! impl_reverse_scalar_ops { #[inline] fn rem(self, rhs: Reverse<$f>) -> Reverse<$f> { let value = self % rhs.value; - // Constant remainder w.r.t. denominator has zero derivative. 
- Reverse { - value, - index: CONSTANT, - } + let q = (self / rhs.value).trunc(); + let index = tape::with_active_tape(|t| t.push_unary(rhs.index, -q)); + Reverse { value, index } } } }; diff --git a/src/traits/taylor_std_ops.rs b/src/traits/taylor_std_ops.rs index bcf2892..c399100 100644 --- a/src/traits/taylor_std_ops.rs +++ b/src/traits/taylor_std_ops.rs @@ -73,13 +73,15 @@ impl Neg for Taylor { impl Rem for Taylor { type Output = Self; #[inline] + #[allow(clippy::suspicious_arithmetic_impl)] fn rem(self, rhs: Self) -> Self { + let q = (self.coeffs[0] / rhs.coeffs[0]).trunc(); Taylor { coeffs: std::array::from_fn(|k| { if k == 0 { self.coeffs[0] % rhs.coeffs[0] } else { - self.coeffs[k] + self.coeffs[k] - rhs.coeffs[k] * q } }), } @@ -311,7 +313,10 @@ impl Rem for TaylorDyn { fn rem(self, rhs: Self) -> Self { TaylorDyn::binary_op(&self, &rhs, |a, b, c| { c[0] = a[0] % b[0]; - c[1..].copy_from_slice(&a[1..]); + let q = (a[0] / b[0]).trunc(); + for k in 1..c.len() { + c[k] = a[k] - b[k] * q; + } }) } } diff --git a/tests/nonsmooth.rs b/tests/nonsmooth.rs index c5459ef..586d39d 100644 --- a/tests/nonsmooth.rs +++ b/tests/nonsmooth.rs @@ -280,14 +280,33 @@ fn forward_nonsmooth_detects_ceil_kink() { #[test] fn forward_nonsmooth_detects_round_trunc() { - // round(x) and trunc(x) near integers → kinks detected + // Trunc has kinks at integers, Round has kinks at half-integers. + // Use x = 4.001 (near integer 4) for trunc, x = 3.501 (near half-integer 3.5) for round. 
+ + // Test trunc near integer + let (mut tape, _) = record(|x| x[0].trunc(), &[4.001]); + let info = tape.forward_nonsmooth(&[4.001]); + assert_eq!(info.kinks.len(), 1); + assert_eq!(info.kinks[0].opcode, OpCode::Trunc); + assert_eq!(info.active_kinks(0.01).len(), 1); + + // Test round near half-integer + let (mut tape, _) = record(|x| x[0].round(), &[3.501]); + let info = tape.forward_nonsmooth(&[3.501]); + assert_eq!(info.kinks.len(), 1); + assert_eq!(info.kinks[0].opcode, OpCode::Round); + assert_eq!(info.active_kinks(0.01).len(), 1); + + // Both in one expression: use a value near both a half-integer and an integer + // is impossible (they're 0.5 apart), so test that both kinks are detected + // even when only one is active. let (mut tape, _) = record(|x| x[0].round() + x[0].trunc(), &[4.001]); let info = tape.forward_nonsmooth(&[4.001]); assert_eq!(info.kinks.len(), 2); assert_eq!(info.kinks[0].opcode, OpCode::Round); assert_eq!(info.kinks[1].opcode, OpCode::Trunc); - // Both near integer → active - assert_eq!(info.active_kinks(0.01).len(), 2); + // Only trunc is near its kink (integer); round's kink is at half-integers + assert_eq!(info.active_kinks(0.01).len(), 1); } #[test] diff --git a/tests/opcode_edge_cases.rs b/tests/opcode_edge_cases.rs index 10a66fd..27a4e36 100644 --- a/tests/opcode_edge_cases.rs +++ b/tests/opcode_edge_cases.rs @@ -1,8 +1,10 @@ #![cfg(feature = "bytecode")] use echidna::opcode::{ - eval_forward, forced_reverse_partials, powi_exp_encode, reverse_partials, OpCode, + eval_forward, forced_reverse_partials, powi_exp_decode_raw, powi_exp_encode, reverse_partials, + OpCode, }; +use num_traits::Float; const TOL: f64 = 1e-10; @@ -406,3 +408,59 @@ fn powi_encode_zero() { fn powi_encode_negative() { assert_eq!(powi_exp_encode(-1), u32::MAX); } + +// ══════════════════════════════════════════════ +// powi_exp_decode_raw round-trip +// ══════════════════════════════════════════════ + +#[test] +fn powi_decode_raw_positive() { + 
assert_eq!(powi_exp_decode_raw(powi_exp_encode(5)), 5); +} + +#[test] +fn powi_decode_raw_zero() { + assert_eq!(powi_exp_decode_raw(powi_exp_encode(0)), 0); +} + +#[test] +fn powi_decode_raw_negative() { + assert_eq!(powi_exp_decode_raw(powi_exp_encode(-1)), -1); + assert_eq!(powi_exp_decode_raw(powi_exp_encode(-2)), -2); + assert_eq!(powi_exp_decode_raw(powi_exp_encode(-10)), -10); + assert_eq!(powi_exp_decode_raw(powi_exp_encode(i32::MIN)), i32::MIN); +} + +// ══════════════════════════════════════════════ +// f32 powi tape: value + gradient (regression for the f32 exponent encoding bug) +// ══════════════════════════════════════════════ + +#[test] +fn f32_tape_powi_negative_2() { + // x^(-2) at x=2: value=0.25, gradient=-2*2^(-3)=-0.25 + let (mut tape, _) = echidna::record(|x: &[echidna::BReverse]| x[0].powi(-2), &[2.0f32]); + let grad = tape.gradient(&[2.0f32]); + assert!((tape.values_slice()[tape.output_index()] - 0.25f32).abs() < 1e-6); + assert!((grad[0] - (-0.25f32)).abs() < 1e-6); +} + +#[test] +fn f32_tape_powi_negative_10() { + // x^(-10) at x=2: value=1/1024, gradient=-10*2^(-11)=-10/2048 + let expected_val: f32 = 2.0f32.powi(-10); + let expected_grad: f32 = -10.0 * 2.0f32.powi(-11); + let (mut tape, _) = echidna::record(|x: &[echidna::BReverse]| x[0].powi(-10), &[2.0f32]); + let grad = tape.gradient(&[2.0f32]); + assert!( + (tape.values_slice()[tape.output_index()] - expected_val).abs() < 1e-8, + "value: got {}, expected {}", + tape.values_slice()[tape.output_index()], + expected_val + ); + assert!( + (grad[0] - expected_grad).abs() < 1e-8, + "grad: got {}, expected {}", + grad[0], + expected_grad + ); +} diff --git a/tests/taylor.rs b/tests/taylor.rs index 11593e5..25a8eea 100644 --- a/tests/taylor.rs +++ b/tests/taylor.rs @@ -808,3 +808,66 @@ fn taylor_dyn_through_scalar_generic() { let result = ad_generic_rosenbrock(x, y); assert_relative_eq!(result.value(), 0.0, epsilon = 1e-10); } + +// ══════════════════════════════════════════════ +// powi with 
negative base (regression test) +// ══════════════════════════════════════════════ + +#[test] +fn taylor_powi_negative_base_squared() { + // f(t) = (-2 + t)^2 = 4 - 4t + t^2 + // Coefficients: [4, -4, 1, 0] + let x = Taylor::::variable(-2.0); + let result = x.powi(2); + assert_relative_eq!(result.coeffs[0], 4.0, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[1], -4.0, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[2], 1.0, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[3], 0.0, epsilon = 1e-12); +} + +#[test] +fn taylor_powi_negative_base_cubed() { + // f(t) = (-2 + t)^3 = -8 + 12t - 6t^2 + t^3 + // Coefficients: [-8, 12, -6, 1] + let x = Taylor::::variable(-2.0); + let result = x.powi(3); + assert_relative_eq!(result.coeffs[0], -8.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[1], 12.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[2], -6.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[3], 1.0, epsilon = 1e-10); +} + +#[test] +fn taylor_powi_negative_base_fourth_power() { + // f(t) = (-3 + t)^4 = 81 - 108t + 54t^2 - 12t^3 + t^4 + let x = Taylor::::variable(-3.0); + let result = x.powi(4); + assert_relative_eq!(result.coeffs[0], 81.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[1], -108.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[2], 54.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[3], -12.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[4], 1.0, epsilon = 1e-10); +} + +#[test] +fn taylor_powi_negative_base_negative_exponent() { + // f(t) = (-2 + t)^(-1) at t=0: + // f(0) = -0.5, f'(0) = -(-2)^(-2) = -0.25, f''(0)/2 = -(-2)^(-3) = -0.125 + let x = Taylor::::variable(-2.0); + let result = x.powi(-1); + assert_relative_eq!(result.coeffs[0], -0.5, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[1], -0.25, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[2], -0.125, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[3], -0.0625, epsilon = 1e-12); +} + +#[test] +fn 
taylor_powi_positive_base_still_works() {
+    // Regression: positive base should still use the efficient exp-ln path for large exponents.
+    // f(t) = (2 + t)^10 at t=0: c[0] = 1024
+    let x = Taylor::<f64, 4>::variable(2.0);