Entrolution · gvonness-apolitical · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.1] - 2026-03-14
+
+### Fixed
+
+- **Powi f32 exponent encoding**: `powi(n)` on f32 bytecode tapes silently produced wrong values and gradients for negative exponents (`n <= -2`). The `i32` exponent was stored as `u32` then round-tripped through `f32`, which loses precision for values > 2^24 (all negative exponents). All 5 dispatch sites (forward, reverse, tangent forward, tangent reverse, cross-country) now decode the exponent directly from the raw `u32` via `powi_exp_decode_raw`, bypassing the float conversion entirely.
+- **taylor_powi negative base**: `Taylor::powi` and bytecode Taylor-mode produced NaN for negative base values (e.g. `(-2)^3`) because the implementation used `exp(n * ln(a))` which fails for `ln(negative)`. Added `taylor_powi_squaring` using binary exponentiation with `taylor_mul`, dispatched when `a[0] < 0` or `|n| <= 8`.
+- **Checkpoint position lookup**: `grad_checkpointed`, `grad_checkpointed_disk`, and `grad_checkpointed_with_hints` used `Vec::contains()` for checkpoint position lookups (O(n) per step). Converted to `HashSet` for O(1) lookups.
+- **Nonsmooth Round kink detection**: `forward_nonsmooth` now correctly detects Round kinks at half-integers (0.5, 1.5, ...) instead of at integers, matching the actual discontinuity locations of the `round` function. Updated test to match.
+
 ## [0.4.0] - 2026-02-26
 
 ### Changed
@@ -175,7 +184,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Forward-vs-reverse cross-validation on Rosenbrock, Beale, Ackley, Booth, and more
 - Criterion benchmarks for forward overhead and reverse gradient
 
-[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.0...HEAD
+[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.1...HEAD
+[0.4.1]: https://github.com/Entrolution/echidna/compare/v0.4.0...v0.4.1
 [0.4.0]: https://github.com/Entrolution/echidna/compare/v0.3.0...v0.4.0
 [0.3.0]: https://github.com/Entrolution/echidna/compare/v0.2.0...v0.3.0
 [0.2.0]: https://github.com/Entrolution/echidna/compare/v0.1.0...v0.2.0

@@ -3,7 +3,7 @@ members = [".", "echidna-optim"]
 
 [package]
 name = "echidna"
-version = "0.4.0"
+version = "0.4.1"
 edition = "2021"
 rust-version = "1.93"
 description = "A high-performance automatic differentiation library for Rust"

@@ -4,10 +4,10 @@
 
 | Version | Supported |
 |---------|-----------|
-| 0.4.x   | Yes       |
-| < 0.4   | No        |
+| >= 0.4.1 | Yes       |
+| < 0.4.1  | No        |
 
-Only the latest minor release receives security updates.
+Only the latest patch release receives security updates. Versions prior to 0.4.1 have known correctness bugs (silent wrong results from `powi` on f32 tapes, NaN from `Taylor::powi` with negative base).
 
 ## Reporting a Vulnerability
 

@@ -1,6 +1,6 @@
 [package]
 name = "echidna-optim"
-version = "0.4.0"
+version = "0.4.1"
 edition = "2021"
 rust-version = "1.93"
 description = "Optimization solvers and implicit differentiation for echidna"
@@ -12,7 +12,7 @@ keywords = ["optimization", "differentiation", "implicit", "gradient", "scientif
 categories = ["mathematics", "science", "algorithms"]
 
 [dependencies]
-echidna = { version = "0.4.0", path = "..", features = ["bytecode"] }
+echidna = { version = "0.4.1", path = "..", features = ["bytecode"] }
 num-traits = "0.2"
 faer = { version = "0.24", optional = true }
 

@@ -51,9 +51,9 @@ pub fn lbfgs<F: Float, O: Objective<F>>(
     if config.memory == 0 || config.convergence.max_iter == 0 {
         return OptimResult {
             x: x0.to_vec(),
-            value: F::zero(),
-            gradient: vec![F::zero(); n],
-            gradient_norm: F::zero(),
+            value: F::nan(),
+            gradient: vec![F::nan(); n],
+            gradient_norm: F::nan(),
             iterations: 0,
             func_evals: 0,
             termination: TerminationReason::NumericalError,

@@ -49,9 +49,9 @@ pub fn newton<F: Float, O: Objective<F>>(
     if config.convergence.max_iter == 0 {
         return OptimResult {
             x: x0.to_vec(),
-            value: F::zero(),
-            gradient: vec![F::zero(); n],
-            gradient_norm: F::zero(),
+            value: F::nan(),
+            gradient: vec![F::nan(); n],
+            gradient_norm: F::nan(),
             iterations: 0,
             func_evals: 0,
             termination: TerminationReason::NumericalError,

@@ -62,9 +62,9 @@ pub fn trust_region<F: Float, O: Objective<F>>(
     {
         return OptimResult {
             x: x0.to_vec(),
-            value: F::zero(),
-            gradient: vec![F::zero(); n],
-            gradient_norm: F::zero(),
+            value: F::nan(),
+            gradient: vec![F::nan(); n],
+            gradient_norm: F::nan(),
             iterations: 0,
             func_evals: 0,
             termination: TerminationReason::NumericalError,

@@ -43,6 +43,12 @@ pub fn grad<F: Float + TapeThreadLocal>(
     let output = f(&inputs);
     drop(guard);
 
+    // If the output is a constant (independent of all inputs), the gradient is zero.
+    if output.index == crate::tape::CONSTANT {
+        Tape::return_to_pool(tape);
+        return vec![F::zero(); n];
+    }
+
     // Run reverse sweep.
     let adjoints = tape.reverse(output.index);
 

@@ -30,10 +30,13 @@ fn forward_dispatch<F: Float>(
             op => {
                 let [a_idx, b_idx] = arg_indices[i];
                 let a = values[a_idx as usize];
-                let b = if b_idx != UNUSED && op != OpCode::Powi {
+                if op == OpCode::Powi {
+                    let exp = opcode::powi_exp_decode_raw(b_idx);
+                    values[i] = a.powi(exp);
+                    continue;
+                }
+                let b = if b_idx != UNUSED {
                     values[b_idx as usize]
-                } else if op == OpCode::Powi {
-                    F::from(b_idx).unwrap_or_else(|| F::zero())
                 } else {
                     F::zero()
                 };
@@ -128,7 +131,7 @@ impl<F: Float> super::BytecodeTape<F> {
                         branch: if a >= F::zero() { 1 } else { -1 },
                     });
                 }
-                OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => {
+                OpCode::Floor | OpCode::Ceil | OpCode::Trunc => {
                     // Kink at integer values. switching_value = distance to
                     // nearest integer: zero exactly at kink points, works
                     // symmetrically for both approach directions.
@@ -143,6 +146,19 @@ impl<F: Float> super::BytecodeTape<F> {
                         },
                     });
                 }
+                OpCode::Round => {
+                    // Round has kinks at half-integers (0.5, 1.5, ...),
+                    // not at integers. Shift by 0.5 to measure distance
+                    // to the nearest half-integer.
+                    let half = F::from(0.5).unwrap();
+                    let shifted = a + half;
+                    kinks.push(crate::nonsmooth::KinkEntry {
+                        tape_index: i as u32,
+                        opcode: op,
+                        switching_value: shifted - shifted.round(),
+                        branch: if a - a.floor() < half { 1 } else { -1 },
+                    });
+                }
                 _ => unreachable!(),
             }
         }

@@ -34,7 +34,13 @@ impl<F: Float> super::BytecodeTape<F> {
             if a != UNUSED {
                 stack.push(a);
             }
-            if b != UNUSED && self.opcodes[i] != OpCode::Powi {
+            if self.opcodes[i] == OpCode::Custom {
+                // For Custom ops, b is a callback index, not a tape index.
+                // The actual second operand (for binary custom ops) is in custom_second_args.
+                if let Some(&second_arg) = self.custom_second_args.get(&(idx)) {
+                    stack.push(second_arg);
+                }
+            } else if b != UNUSED && self.opcodes[i] != OpCode::Powi {
                 stack.push(b);
             }
         }
@@ -62,10 +68,13 @@ impl<F: Float> super::BytecodeTape<F> {
                 } else {
                     UNUSED
                 };
-                let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi {
+                let rb = if b != UNUSED
+                    && self.opcodes[read] != OpCode::Powi
+                    && self.opcodes[read] != OpCode::Custom
+                {
                     remap[b as usize]
                 } else {
-                    b
+                    b // Powi: b is encoded exponent; Custom: b is callback index
                 };
                 self.arg_indices[write] = [ra, rb];
                 write += 1;
@@ -77,6 +86,16 @@ impl<F: Float> super::BytecodeTape<F> {
         self.values.truncate(new_len);
         self.num_variables = new_len as u32;
 
+        // Remap custom_second_args: both keys and values are tape indices.
+        if !self.custom_second_args.is_empty() {
+            self.custom_second_args = self
+                .custom_second_args
+                .iter()
+                .filter(|(&k, _)| reachable[k as usize])
+                .map(|(&k, &v)| (remap[k as usize], remap[v as usize]))
+                .collect();
+        }
+
         remap
     }
 
@@ -97,9 +116,6 @@ impl<F: Float> super::BytecodeTape<F> {
     /// `output_indices` contains only the active outputs (remapped), and
     /// `output_index` is set to the first active output.
     ///
-    /// **Note**: Like `dead_code_elimination`, this does not remap
-    /// `custom_second_args` (pre-existing limitation).
-    ///
     /// # Panics
     /// Panics if `active_outputs` is empty.
     pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) {
@@ -142,14 +158,20 @@ impl<F: Float> super::BytecodeTape<F> {
             }
 
             let [mut a, mut b] = self.arg_indices[i];
-            // Apply remap to args (except Powi exponent in b).
+            // Apply remap to args (except Powi exponent and Custom callback in b).
             a = remap[a as usize];
-            if b != UNUSED && op != OpCode::Powi {
+            if b != UNUSED && op != OpCode::Powi && op != OpCode::Custom {
                 b = remap[b as usize];
             }
             // Update arg_indices with remapped values.
             self.arg_indices[i] = [a, b];
 
+            // Skip CSE for Custom ops: the key doesn't capture custom_second_args,
+            // so binary custom ops with different second operands would be incorrectly merged.
+            if op == OpCode::Custom {
+                continue;
+            }
+
             // Build the canonical key.
             let key = if b == UNUSED {
                 // Unary: hash (op, arg0) only; use UNUSED as placeholder.
@@ -176,14 +198,22 @@ impl<F: Float> super::BytecodeTape<F> {
             }
             let [a, b] = self.arg_indices[i];
             let ra = remap[a as usize];
-            let rb = if b != UNUSED && op != OpCode::Powi {
+            let rb = if b != UNUSED && op != OpCode::Powi && op != OpCode::Custom {
                 remap[b as usize]
             } else {
-                b
+                b // Powi: b is encoded exponent; Custom: b is callback index
             };
             self.arg_indices[i] = [ra, rb];
         }
 
+        // Remap custom_second_args values (keys are not changed by CSE remap,
+        // but the second operand indices may have been CSE'd).
+        if !self.custom_second_args.is_empty() {
+            for v in self.custom_second_args.values_mut() {
+                *v = remap[*v as usize];
+            }
+        }
+
         // Update output indices.
         self.output_index = remap[self.output_index as usize];
         for oi in &mut self.output_indices {
@@ -198,8 +228,7 @@ impl<F: Float> super::BytecodeTape<F> {
     ///
     /// In debug builds, validates internal consistency after optimization.
     pub fn optimize(&mut self) {
-        self.cse();
-        self.dead_code_elimination();
+        self.cse(); // CSE already calls dead_code_elimination() internally.
 
         // Validate internal consistency in debug builds.
         #[cfg(debug_assertions)]

@@ -59,10 +59,15 @@ impl<F: Float> super::BytecodeTape<F> {
                     adjoints[i] = F::zero();
                     let [a_idx, b_idx] = self.arg_indices[i];
                     let a = values[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                    if op == OpCode::Powi {
+                        let exp = opcode::powi_exp_decode_raw(b_idx);
+                        let n = F::from(exp).unwrap();
+                        let da = n * a.powi(exp - 1);
+                        adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
+                        continue;
+                    }
+                    let b = if b_idx != UNUSED {
                         values[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        F::from(b_idx).unwrap_or_else(|| F::zero())
                     } else {
                         F::zero()
                     };
@@ -74,7 +79,7 @@ impl<F: Float> super::BytecodeTape<F> {
                     };
 
                     adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
-                    if b_idx != UNUSED && op != OpCode::Powi {
+                    if b_idx != UNUSED {
                         adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj;
                     }
                 }

@@ -80,10 +80,13 @@ impl<F: Float> super::BytecodeTape<F> {
                 op => {
                     let [a_idx, b_idx] = self.arg_indices[i];
                     let a = buf[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                    if op == OpCode::Powi {
+                        let exp = opcode::powi_exp_decode_raw(b_idx);
+                        buf[i] = a.powi(exp);
+                        continue;
+                    }
+                    let b = if b_idx != UNUSED {
                         buf[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        T::from(b_idx).unwrap_or_else(|| T::zero())
                     } else {
                         T::zero()
                     };
@@ -170,18 +173,23 @@ impl<F: Float> super::BytecodeTape<F> {
 
                     let [a_idx, b_idx] = self.arg_indices[i];
                     let a = tangent_vals[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                    if op == OpCode::Powi {
+                        let exp = opcode::powi_exp_decode_raw(b_idx);
+                        let n = T::from(exp).unwrap();
+                        let da = n * a.powi(exp - 1);
+                        buf[a_idx as usize] = buf[a_idx as usize] + da * adj;
+                        continue;
+                    }
+                    let b = if b_idx != UNUSED {
                         tangent_vals[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        T::from(b_idx).unwrap_or_else(|| T::zero())
                     } else {
                         T::zero()
                     };
                     let r = tangent_vals[i];
                     let (da, db) = opcode::reverse_partials(op, a, b, r);
 
                     buf[a_idx as usize] = buf[a_idx as usize] + da * adj;
-                    if b_idx != UNUSED && op != OpCode::Powi {
+                    if b_idx != UNUSED {
                         buf[b_idx as usize] = buf[b_idx as usize] + db * adj;
                     }
                 }