diff --git a/CHANGELOG.md b/CHANGELOG.md index 054d493..7f881fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.1] - 2026-03-14 + +### Fixed + +- **Powi f32 exponent encoding**: `powi(n)` on f32 bytecode tapes silently produced wrong values and gradients for negative exponents (`n <= -2`). The `i32` exponent was stored as `u32` then round-tripped through `f32`, which loses precision for values > 2^24 (all negative exponents). All 5 dispatch sites (forward, reverse, tangent forward, tangent reverse, cross-country) now decode the exponent directly from the raw `u32` via `powi_exp_decode_raw`, bypassing the float conversion entirely. +- **taylor_powi negative base**: `Taylor::powi` and bytecode Taylor-mode produced NaN for negative base values (e.g. `(-2)^3`) because the implementation used `exp(n * ln(a))` which fails for `ln(negative)`. Added `taylor_powi_squaring` using binary exponentiation with `taylor_mul`, dispatched when `a[0] < 0` or `|n| <= 8`. +- **Checkpoint position lookup**: `grad_checkpointed`, `grad_checkpointed_disk`, and `grad_checkpointed_with_hints` used `Vec::contains()` for checkpoint position lookups (O(n) per step). Converted to `HashSet` for O(1) lookups. +- **Nonsmooth Round kink detection**: `forward_nonsmooth` now correctly detects Round kinks at half-integers (0.5, 1.5, ...) instead of at integers, matching the actual discontinuity locations of the `round` function. Updated test to match. 
+ ## [0.4.0] - 2026-02-26 ### Changed @@ -175,7 +184,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Forward-vs-reverse cross-validation on Rosenbrock, Beale, Ackley, Booth, and more - Criterion benchmarks for forward overhead and reverse gradient -[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.0...HEAD +[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.1...HEAD +[0.4.1]: https://github.com/Entrolution/echidna/compare/v0.4.0...v0.4.1 [0.4.0]: https://github.com/Entrolution/echidna/compare/v0.3.0...v0.4.0 [0.3.0]: https://github.com/Entrolution/echidna/compare/v0.2.0...v0.3.0 [0.2.0]: https://github.com/Entrolution/echidna/compare/v0.1.0...v0.2.0 diff --git a/Cargo.toml b/Cargo.toml index eaf5666..5dd77b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = [".", "echidna-optim"] [package] name = "echidna" -version = "0.4.0" +version = "0.4.1" edition = "2021" rust-version = "1.93" description = "A high-performance automatic differentiation library for Rust" diff --git a/SECURITY.md b/SECURITY.md index 63c0c2f..f48d129 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,10 +4,10 @@ | Version | Supported | |---------|-----------| -| 0.4.x | Yes | -| < 0.4 | No | +| >= 0.4.1 | Yes | +| < 0.4.1 | No | -Only the latest minor release receives security updates. +Only the latest patch release receives security updates. Versions prior to 0.4.1 have known correctness bugs (silent wrong results from `powi` on f32 tapes, NaN from `Taylor::powi` with negative base). 
## Reporting a Vulnerability diff --git a/echidna-optim/Cargo.toml b/echidna-optim/Cargo.toml index 267f5ba..f458803 100644 --- a/echidna-optim/Cargo.toml +++ b/echidna-optim/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "echidna-optim" -version = "0.4.0" +version = "0.4.1" edition = "2021" rust-version = "1.93" description = "Optimization solvers and implicit differentiation for echidna" @@ -12,7 +12,7 @@ keywords = ["optimization", "differentiation", "implicit", "gradient", "scientif categories = ["mathematics", "science", "algorithms"] [dependencies] -echidna = { version = "0.4.0", path = "..", features = ["bytecode"] } +echidna = { version = "0.4.1", path = "..", features = ["bytecode"] } num-traits = "0.2" faer = { version = "0.24", optional = true } diff --git a/echidna-optim/src/solvers/lbfgs.rs b/echidna-optim/src/solvers/lbfgs.rs index 452ffed..966edb1 100644 --- a/echidna-optim/src/solvers/lbfgs.rs +++ b/echidna-optim/src/solvers/lbfgs.rs @@ -51,9 +51,9 @@ pub fn lbfgs>( if config.memory == 0 || config.convergence.max_iter == 0 { return OptimResult { x: x0.to_vec(), - value: F::zero(), - gradient: vec![F::zero(); n], - gradient_norm: F::zero(), + value: F::nan(), + gradient: vec![F::nan(); n], + gradient_norm: F::nan(), iterations: 0, func_evals: 0, termination: TerminationReason::NumericalError, diff --git a/echidna-optim/src/solvers/newton.rs b/echidna-optim/src/solvers/newton.rs index f2f19f8..f2636e5 100644 --- a/echidna-optim/src/solvers/newton.rs +++ b/echidna-optim/src/solvers/newton.rs @@ -49,9 +49,9 @@ pub fn newton>( if config.convergence.max_iter == 0 { return OptimResult { x: x0.to_vec(), - value: F::zero(), - gradient: vec![F::zero(); n], - gradient_norm: F::zero(), + value: F::nan(), + gradient: vec![F::nan(); n], + gradient_norm: F::nan(), iterations: 0, func_evals: 0, termination: TerminationReason::NumericalError, diff --git a/echidna-optim/src/solvers/trust_region.rs b/echidna-optim/src/solvers/trust_region.rs index ddbc944..fd415c4 100644 
--- a/echidna-optim/src/solvers/trust_region.rs +++ b/echidna-optim/src/solvers/trust_region.rs @@ -62,9 +62,9 @@ pub fn trust_region>( { return OptimResult { x: x0.to_vec(), - value: F::zero(), - gradient: vec![F::zero(); n], - gradient_norm: F::zero(), + value: F::nan(), + gradient: vec![F::nan(); n], + gradient_norm: F::nan(), iterations: 0, func_evals: 0, termination: TerminationReason::NumericalError, diff --git a/src/api.rs b/src/api.rs index 3674a2a..1b30a6e 100644 --- a/src/api.rs +++ b/src/api.rs @@ -43,6 +43,12 @@ pub fn grad( let output = f(&inputs); drop(guard); + // If the output is a constant (independent of all inputs), the gradient is zero. + if output.index == crate::tape::CONSTANT { + Tape::return_to_pool(tape); + return vec![F::zero(); n]; + } + // Run reverse sweep. let adjoints = tape.reverse(output.index); diff --git a/src/bytecode_tape/forward.rs b/src/bytecode_tape/forward.rs index faeff59..b77b1bd 100644 --- a/src/bytecode_tape/forward.rs +++ b/src/bytecode_tape/forward.rs @@ -30,10 +30,13 @@ fn forward_dispatch( op => { let [a_idx, b_idx] = arg_indices[i]; let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + values[i] = a.powi(exp); + continue; + } + let b = if b_idx != UNUSED { values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) } else { F::zero() }; @@ -128,7 +131,7 @@ impl super::BytecodeTape { branch: if a >= F::zero() { 1 } else { -1 }, }); } - OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => { + OpCode::Floor | OpCode::Ceil | OpCode::Trunc => { // Kink at integer values. switching_value = distance to // nearest integer: zero exactly at kink points, works // symmetrically for both approach directions. @@ -143,6 +146,19 @@ impl super::BytecodeTape { }, }); } + OpCode::Round => { + // Round has kinks at half-integers (0.5, 1.5, ...), + // not at integers. 
Shift by 0.5 to measure distance + // to the nearest half-integer. + let half = F::from(0.5).unwrap(); + let shifted = a + half; + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: shifted - shifted.round(), + branch: if a - a.floor() < half { 1 } else { -1 }, + }); + } _ => unreachable!(), } } diff --git a/src/bytecode_tape/optimize.rs b/src/bytecode_tape/optimize.rs index 9e66e3e..f0725a9 100644 --- a/src/bytecode_tape/optimize.rs +++ b/src/bytecode_tape/optimize.rs @@ -34,7 +34,13 @@ impl super::BytecodeTape { if a != UNUSED { stack.push(a); } - if b != UNUSED && self.opcodes[i] != OpCode::Powi { + if self.opcodes[i] == OpCode::Custom { + // For Custom ops, b is a callback index, not a tape index. + // The actual second operand (for binary custom ops) is in custom_second_args. + if let Some(&second_arg) = self.custom_second_args.get(&(idx)) { + stack.push(second_arg); + } + } else if b != UNUSED && self.opcodes[i] != OpCode::Powi { stack.push(b); } } @@ -62,10 +68,13 @@ impl super::BytecodeTape { } else { UNUSED }; - let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi { + let rb = if b != UNUSED + && self.opcodes[read] != OpCode::Powi + && self.opcodes[read] != OpCode::Custom + { remap[b as usize] } else { - b + b // Powi: b is encoded exponent; Custom: b is callback index }; self.arg_indices[write] = [ra, rb]; write += 1; @@ -77,6 +86,16 @@ impl super::BytecodeTape { self.values.truncate(new_len); self.num_variables = new_len as u32; + // Remap custom_second_args: both keys and values are tape indices. + if !self.custom_second_args.is_empty() { + self.custom_second_args = self + .custom_second_args + .iter() + .filter(|(&k, _)| reachable[k as usize]) + .map(|(&k, &v)| (remap[k as usize], remap[v as usize])) + .collect(); + } + remap } @@ -97,9 +116,6 @@ impl super::BytecodeTape { /// `output_indices` contains only the active outputs (remapped), and /// `output_index` is set to the first active output. 
/// - /// **Note**: Like `dead_code_elimination`, this does not remap - /// `custom_second_args` (pre-existing limitation). - /// /// # Panics /// Panics if `active_outputs` is empty. pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) { @@ -142,14 +158,20 @@ impl super::BytecodeTape { } let [mut a, mut b] = self.arg_indices[i]; - // Apply remap to args (except Powi exponent in b). + // Apply remap to args (except Powi exponent and Custom callback in b). a = remap[a as usize]; - if b != UNUSED && op != OpCode::Powi { + if b != UNUSED && op != OpCode::Powi && op != OpCode::Custom { b = remap[b as usize]; } // Update arg_indices with remapped values. self.arg_indices[i] = [a, b]; + // Skip CSE for Custom ops: the key doesn't capture custom_second_args, + // so binary custom ops with different second operands would be incorrectly merged. + if op == OpCode::Custom { + continue; + } + // Build the canonical key. let key = if b == UNUSED { // Unary: hash (op, arg0) only; use UNUSED as placeholder. @@ -176,14 +198,22 @@ impl super::BytecodeTape { } let [a, b] = self.arg_indices[i]; let ra = remap[a as usize]; - let rb = if b != UNUSED && op != OpCode::Powi { + let rb = if b != UNUSED && op != OpCode::Powi && op != OpCode::Custom { remap[b as usize] } else { - b + b // Powi: b is encoded exponent; Custom: b is callback index }; self.arg_indices[i] = [ra, rb]; } + // Remap custom_second_args values (keys are not changed by CSE remap, + // but the second operand indices may have been CSE'd). + if !self.custom_second_args.is_empty() { + for v in self.custom_second_args.values_mut() { + *v = remap[*v as usize]; + } + } + // Update output indices. self.output_index = remap[self.output_index as usize]; for oi in &mut self.output_indices { @@ -198,8 +228,7 @@ impl super::BytecodeTape { /// /// In debug builds, validates internal consistency after optimization. 
pub fn optimize(&mut self) { - self.cse(); - self.dead_code_elimination(); + self.cse(); // CSE already calls dead_code_elimination() internally. // Validate internal consistency in debug builds. #[cfg(debug_assertions)] diff --git a/src/bytecode_tape/reverse.rs b/src/bytecode_tape/reverse.rs index 0c75cfd..21197df 100644 --- a/src/bytecode_tape/reverse.rs +++ b/src/bytecode_tape/reverse.rs @@ -59,10 +59,15 @@ impl super::BytecodeTape { adjoints[i] = F::zero(); let [a_idx, b_idx] = self.arg_indices[i]; let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + let n = F::from(exp).unwrap(); + let da = n * a.powi(exp - 1); + adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; + continue; + } + let b = if b_idx != UNUSED { values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) } else { F::zero() }; @@ -74,7 +79,7 @@ impl super::BytecodeTape { }; adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { + if b_idx != UNUSED { adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj; } } diff --git a/src/bytecode_tape/tangent.rs b/src/bytecode_tape/tangent.rs index 9f10d48..24e2646 100644 --- a/src/bytecode_tape/tangent.rs +++ b/src/bytecode_tape/tangent.rs @@ -80,10 +80,13 @@ impl super::BytecodeTape { op => { let [a_idx, b_idx] = self.arg_indices[i]; let a = buf[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + buf[i] = a.powi(exp); + continue; + } + let b = if b_idx != UNUSED { buf[b_idx as usize] - } else if op == OpCode::Powi { - T::from(b_idx).unwrap_or_else(|| T::zero()) } else { T::zero() }; @@ -170,10 +173,15 @@ impl super::BytecodeTape { let [a_idx, b_idx] = self.arg_indices[i]; let a = tangent_vals[a_idx as usize]; - let b = if b_idx != UNUSED && op 
!= OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + let n = T::from(exp).unwrap(); + let da = n * a.powi(exp - 1); + buf[a_idx as usize] = buf[a_idx as usize] + da * adj; + continue; + } + let b = if b_idx != UNUSED { tangent_vals[b_idx as usize] - } else if op == OpCode::Powi { - T::from(b_idx).unwrap_or_else(|| T::zero()) } else { T::zero() }; @@ -181,7 +189,7 @@ impl super::BytecodeTape { let (da, db) = opcode::reverse_partials(op, a, b, r); buf[a_idx as usize] = buf[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { + if b_idx != UNUSED { buf[b_idx as usize] = buf[b_idx as usize] + db * adj; } } diff --git a/src/checkpoint.rs b/src/checkpoint.rs index cc22f12..fadb7f9 100644 --- a/src/checkpoint.rs +++ b/src/checkpoint.rs @@ -4,6 +4,8 @@ //! (Griewank & Walther, 2000) to minimize recomputation for a given //! number of checkpoint slots. +use std::collections::HashSet; + use crate::breverse::BReverse; use crate::bytecode_tape::{BtapeGuard, BtapeThreadLocal, BytecodeTape}; use crate::float::Float; @@ -45,8 +47,11 @@ pub fn grad_checkpointed( let num_checkpoints = num_checkpoints.max(1).min(num_steps); - // Compute optimal checkpoint positions using Revolve schedule - let checkpoint_positions = revolve_schedule(num_steps, num_checkpoints); + // Compute optimal checkpoint positions using Revolve schedule. + // Note: the position set is O(num_steps) while checkpoint *states* are O(num_checkpoints). + let checkpoint_positions: HashSet = revolve_schedule(num_steps, num_checkpoints) + .into_iter() + .collect(); // -- Forward pass: run all steps, saving checkpoints -- let mut checkpoints: Vec<(usize, Vec)> = Vec::with_capacity(num_checkpoints + 1); @@ -153,7 +158,7 @@ pub fn grad_checkpointed_online( } // Thin when buffer is full. - if buffer.len() == num_checkpoints { + if buffer.len() >= num_checkpoints { // Keep buffer[0] (pinned). Among buffer[1..], keep even-indexed entries. 
let tail: Vec<(usize, Vec)> = buffer[1..].iter().step_by(2).cloned().collect(); buffer.truncate(1); @@ -244,7 +249,7 @@ pub fn grad_checkpointed_with_hints( let slot_alloc = largest_remainder_alloc(free, &interval_lengths, total_len); // Run Revolve on each sub-interval and merge all positions. - let mut all_positions: Vec = required.clone(); + let mut all_positions: HashSet = required.iter().copied().collect(); for (i, &(start, end)) in intervals.iter().enumerate() { let sub_steps = end - start; let sub_slots = slot_alloc[i]; @@ -254,10 +259,8 @@ pub fn grad_checkpointed_with_hints( all_positions.extend(sub_positions.iter().map(|&p| p + start)); } } - all_positions.sort_unstable(); - all_positions.dedup(); - // Forward pass using the merged position list. + // Forward pass using the merged position set. let mut checkpoints: Vec<(usize, Vec)> = Vec::with_capacity(all_positions.len() + 1); checkpoints.push((0, x0.to_vec())); @@ -370,7 +373,9 @@ pub fn grad_checkpointed_disk( let num_checkpoints = num_checkpoints.max(1).min(num_steps); // Compute optimal checkpoint positions using Revolve schedule. - let checkpoint_positions = revolve_schedule(num_steps, num_checkpoints); + let checkpoint_positions: HashSet = revolve_schedule(num_steps, num_checkpoints) + .into_iter() + .collect(); // Drop guard ensures cleanup even on panic. let mut guard = DiskCheckpointGuard { files: Vec::new() }; @@ -405,7 +410,7 @@ pub fn grad_checkpointed_disk( // Build checkpoint index: sorted list of (step, path) for reading back. // Step 0 is always first. 
let mut ckpt_steps: Vec = vec![0]; - ckpt_steps.extend(checkpoint_positions.iter().filter(|&&p| p < num_steps)); + ckpt_steps.extend(checkpoint_positions.iter().filter(|&p| *p < num_steps)); ckpt_steps.sort_unstable(); ckpt_steps.dedup(); diff --git a/src/cross_country.rs b/src/cross_country.rs index fa9c50b..83fa5ed 100644 --- a/src/cross_country.rs +++ b/src/cross_country.rs @@ -77,10 +77,18 @@ impl LinearizedGraph { op => { let [a_idx, b_idx] = arg_indices[i]; let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { + if op == OpCode::Powi { + let exp = opcode::powi_exp_decode_raw(b_idx); + let n = F::from(exp).unwrap(); + let da = n * a.powi(exp - 1); + if opcodes[a_idx as usize] != OpCode::Const && da != zero { + preds[i].push((a_idx, da)); + succs[a_idx as usize].push((i as u32, da)); + } + continue; + } + let b = if b_idx != UNUSED { values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or(zero) } else { zero }; @@ -93,12 +101,8 @@ impl LinearizedGraph { succs[a_idx as usize].push((i as u32, da)); } - // Edge from second argument (binary ops only, not Powi) - if b_idx != UNUSED - && op != OpCode::Powi - && opcodes[b_idx as usize] != OpCode::Const - && db != zero - { + // Edge from second argument (binary ops only) + if b_idx != UNUSED && opcodes[b_idx as usize] != OpCode::Const && db != zero { preds[i].push((b_idx, db)); succs[b_idx as usize].push((i as u32, db)); } diff --git a/src/dual.rs b/src/dual.rs index 879aafb..b18f036 100644 --- a/src/dual.rs +++ b/src/dual.rs @@ -85,7 +85,7 @@ impl Dual { #[inline] pub fn powi(self, n: i32) -> Self { let val = self.re.powi(n); - let deriv = F::from(n).unwrap() * self.re.powi(n - 1); + let deriv = F::from(n).unwrap() * self.re.powf(F::from(n as i64 - 1).unwrap()); self.chain(val, deriv) } @@ -315,7 +315,11 @@ impl Dual { let h = self.re.hypot(other.re); Dual { re: h, - eps: (self.re * self.eps + other.re * other.eps) / h, + eps: if h == F::zero() { + F::zero() + 
} else { + (self.re * self.eps + other.re * other.eps) / h + }, } } diff --git a/src/dual_vec.rs b/src/dual_vec.rs index 4e6e74e..873d221 100644 --- a/src/dual_vec.rs +++ b/src/dual_vec.rs @@ -106,7 +106,7 @@ impl DualVec { #[inline] pub fn powi(self, n: i32) -> Self { let val = self.re.powi(n); - let deriv = F::from(n).unwrap() * self.re.powi(n - 1); + let deriv = F::from(n).unwrap() * self.re.powf(F::from(n as i64 - 1).unwrap()); self.chain(val, deriv) } @@ -335,7 +335,11 @@ impl DualVec { let h = self.re.hypot(other.re); DualVec { re: h, - eps: std::array::from_fn(|k| (self.re * self.eps[k] + other.re * other.eps[k]) / h), + eps: if h == F::zero() { + [F::zero(); N] + } else { + std::array::from_fn(|k| (self.re * self.eps[k] + other.re * other.eps[k]) / h) + }, } } diff --git a/src/laurent.rs b/src/laurent.rs index 41674fb..d47fe95 100644 --- a/src/laurent.rs +++ b/src/laurent.rs @@ -289,15 +289,17 @@ impl Laurent { pole_order: self.pole_order / 2, } } else if self.pole_order > 0 { - let tc = self.as_taylor_coeffs(); + if self.pole_order % 2 != 0 { + return Self::nan_laurent(); + } + // pole_order is even: sqrt of t^p * f(t) = t^(p/2) * sqrt(f(t)) + // where f(t) has coeffs[0] != 0 (due to normalization). let mut c = [F::zero(); K]; - taylor_ops::taylor_sqrt(&tc, &mut c); - let mut l = Laurent { + taylor_ops::taylor_sqrt(&self.coeffs, &mut c); + Laurent { coeffs: c, - pole_order: 0, - }; - l.normalize(); - l + pole_order: self.pole_order / 2, + } } else { let mut c = [F::zero(); K]; taylor_ops::taylor_sqrt(&self.coeffs, &mut c); @@ -326,17 +328,19 @@ impl Laurent { pole_order: self.pole_order / 3, } } else if self.pole_order > 0 { - let tc = self.as_taylor_coeffs(); + if self.pole_order % 3 != 0 { + return Self::nan_laurent(); + } + // pole_order divisible by 3: cbrt of t^p * f(t) = t^(p/3) * cbrt(f(t)) + // where f(t) has coeffs[0] != 0 (due to normalization). 
let mut c = [F::zero(); K]; let mut s1 = [F::zero(); K]; let mut s2 = [F::zero(); K]; - taylor_ops::taylor_cbrt(&tc, &mut c, &mut s1, &mut s2); - let mut l = Laurent { + taylor_ops::taylor_cbrt(&self.coeffs, &mut c, &mut s1, &mut s2); + Laurent { coeffs: c, - pole_order: 0, - }; - l.normalize(); - l + pole_order: self.pole_order / 3, + } } else { let mut c = [F::zero(); K]; let mut s1 = [F::zero(); K]; diff --git a/src/opcode.rs b/src/opcode.rs index 95291cd..07ffe10 100644 --- a/src/opcode.rs +++ b/src/opcode.rs @@ -276,7 +276,7 @@ pub fn reverse_partials(op: OpCode, a: T, b: T, r: T) -> (T, T) { let inv = one / b; (inv, -a * inv * inv) } - OpCode::Rem => (one, zero), + OpCode::Rem => (one, -(a / b).trunc()), OpCode::Powf => { // d/da a^b = b * a^(b-1) // d/db a^b = a^b * ln(a) @@ -382,6 +382,16 @@ fn powi_exp_decode(b: T) -> i32 { b.to_u32().unwrap_or(0) as i32 } +/// Decode a `powi` exponent directly from the raw `u32` in `arg_indices[1]`. +/// +/// Avoids the float round-trip of `powi_exp_decode`, which silently fails +/// for f32 when the u32 encoding exceeds 2^24 (any negative exponent). +#[inline] +#[must_use] +pub fn powi_exp_decode_raw(b_idx: u32) -> i32 { + b_idx as i32 +} + /// Encode a `powi` exponent as a value that can be stored in `arg_indices[1]`. #[inline] #[must_use] diff --git a/src/sparse.rs b/src/sparse.rs index cb8df5b..9df5ae3 100644 --- a/src/sparse.rs +++ b/src/sparse.rs @@ -120,6 +120,33 @@ pub(crate) fn detect_sparsity_impl( interactions.push((r, c)); } } + // For non-Mul binary ops (Div, Powf, Atan2, Hypot, Custom), + // within-operand second derivatives are nonzero. + // E.g. d^2(a/b)/db^2 = 2a/b^3, so variables flowing into + // the same operand interact. + // Mul is excluded: d^2(a*b)/da^2 = 0 (bilinear). 
+ if op != OpCode::Mul { + for ii in 0..bits_a.len() { + for jj in 0..=ii { + let (r, c) = if bits_a[ii] >= bits_a[jj] { + (bits_a[ii], bits_a[jj]) + } else { + (bits_a[jj], bits_a[ii]) + }; + interactions.push((r, c)); + } + } + for ii in 0..bits_b.len() { + for jj in 0..=ii { + let (r, c) = if bits_b[ii] >= bits_b[jj] { + (bits_b[ii], bits_b[jj]) + } else { + (bits_b[jj], bits_b[ii]) + }; + interactions.push((r, c)); + } + } + } } OpClass::ZeroDerivative => { // Propagate dependencies for downstream ops diff --git a/src/stde/higher_order.rs b/src/stde/higher_order.rs index 3d7e96b..dd2e609 100644 --- a/src/stde/higher_order.rs +++ b/src/stde/higher_order.rs @@ -203,12 +203,13 @@ pub fn diagonal_kth_order_stochastic( let (mean, sample_variance, standard_error) = acc.finalize(); - // Unbiased estimator for Σ_j d_j: n * mean(sampled d_j's) + // Unbiased estimator for Σ_j d_j: n * mean(sampled d_j's). + // Scale variance/SE to match the rescaled estimate. EstimatorResult { value, estimate: mean * nf, - sample_variance, - standard_error, + sample_variance: sample_variance * nf * nf, + standard_error: standard_error * nf, num_samples: sampled_indices.len(), } } diff --git a/src/stde/pde.rs b/src/stde/pde.rs index e9bfc0d..e60554e 100644 --- a/src/stde/pde.rs +++ b/src/stde/pde.rs @@ -171,12 +171,14 @@ pub fn parabolic_diffusion_stochastic( let (mean, sample_variance, standard_error) = acc.finalize(); - // Unbiased estimator for ½ Σ_i (σ·e_i)^T H (σ·e_i): d * mean * ½ + // Unbiased estimator for ½ Σ_i (σ·e_i)^T H (σ·e_i): d * mean * ½. + // Scale variance/SE to match the rescaled estimate. 
+ let scale = df * half; EstimatorResult { value, - estimate: mean * df * half, - sample_variance, - standard_error, + estimate: mean * scale, + sample_variance: sample_variance * scale * scale, + standard_error: standard_error * scale, num_samples: sampled_indices.len(), } } diff --git a/src/taylor_ops.rs b/src/taylor_ops.rs index 21f89f7..3baa212 100644 --- a/src/taylor_ops.rs +++ b/src/taylor_ops.rs @@ -442,12 +442,14 @@ pub fn taylor_powf( /// `c = a^n` (powi) — integer power. /// -/// Uses `scratch1` for `ln(a)` and `scratch2` for `n * ln(a)`. +/// Dispatches between two strategies: +/// - **Repeated squaring** (binary exponentiation via `taylor_mul`): used when +/// `a[0] < 0` (where `ln` would produce NaN) or `|n| <= 8` (at most 3 +/// multiplications, competitive with exp-ln). +/// - **exp(n * ln(a))**: used for positive base with large exponents. #[inline] pub fn taylor_powi(a: &[F], n: i32, c: &mut [F], scratch1: &mut [F], scratch2: &mut [F]) { let deg = c.len(); - // For small integer powers, direct computation may be more efficient, - // but for generality use exp(n * ln(a)). if n == 0 { c[0] = F::one(); for ck in c[1..deg].iter_mut() { @@ -459,14 +461,76 @@ pub fn taylor_powi(a: &[F], n: i32, c: &mut [F], scratch1: &mut [F], s c.copy_from_slice(a); return; } - // scratch1 = ln(a) - taylor_ln(a, scratch1); - // scratch2 = n * ln(a) - let nf = F::from(n).unwrap(); - taylor_scale(scratch1, nf, scratch2); - // c = exp(n * ln(a)) - taylor_exp(scratch2, c); - c[0] = a[0].powi(n); + if n == -1 { + taylor_recip(a, c); + return; + } + if a[0] < F::zero() || n.unsigned_abs() <= 8 { + taylor_powi_squaring(a, n, c, scratch1, scratch2); + } else { + // scratch1 = ln(a) + taylor_ln(a, scratch1); + // scratch2 = n * ln(a) + let nf = F::from(n).unwrap(); + taylor_scale(scratch1, nf, scratch2); + // c = exp(n * ln(a)) + taylor_exp(scratch2, c); + c[0] = a[0].powi(n); + } +} + +/// Integer power via binary exponentiation on Taylor coefficient arrays. 
+/// +/// Computes `a^n` using repeated squaring with `taylor_mul`. Works correctly +/// for negative base values (unlike the exp-ln path). For negative `n`, +/// computes `a^|n|` then takes the reciprocal. +fn taylor_powi_squaring( + a: &[F], + n: i32, + c: &mut [F], + scratch1: &mut [F], + scratch2: &mut [F], +) { + let deg = c.len(); + let abs_n = n.unsigned_abs(); + + // result (c) = 1 + c[0] = F::one(); + for ck in c[1..deg].iter_mut() { + *ck = F::zero(); + } + + // base (scratch1) = a + scratch1[..deg].copy_from_slice(&a[..deg]); + + let mut power = abs_n; + while power > 0 { + if power & 1 == 1 { + // result = result * base + taylor_mul(c, &*scratch1, scratch2); + c[..deg].copy_from_slice(&scratch2[..deg]); + } + power >>= 1; + if power > 0 { + // base = base * base + let base_ref: &[F] = &*scratch1; + // Inline squaring to avoid borrow conflict (scratch1 is both source and dest) + for k in 0..deg { + let mut sum = F::zero(); + for j in 0..=k { + sum = sum + base_ref[j] * base_ref[k - j]; + } + scratch2[k] = sum; + } + scratch1[..deg].copy_from_slice(&scratch2[..deg]); + } + } + + if n < 0 { + // c = 1/c: copy c into scratch1, then compute recip into c + scratch1[..deg].copy_from_slice(&c[..deg]); + taylor_recip(scratch1, c); + } } /// `c = cbrt(a) = a^(1/3)`. diff --git a/src/traits/breverse_ops.rs b/src/traits/breverse_ops.rs index 0343063..ea67625 100644 --- a/src/traits/breverse_ops.rs +++ b/src/traits/breverse_ops.rs @@ -265,11 +265,11 @@ macro_rules! impl_breverse_scalar_ops { #[inline] fn rem(self, rhs: BReverse<$f>) -> BReverse<$f> { let value = self % rhs.value; - // Constant % variable has zero derivative — treat as constant. 
- BReverse { - value, - index: CONSTANT, - } + let index = bytecode_tape::with_active_btape(|t| { + let c = t.push_const(self); + t.push_op(OpCode::Rem, c, rhs.index, value) + }); + BReverse { value, index } } } }; diff --git a/src/traits/dual_vec_ops.rs b/src/traits/dual_vec_ops.rs index 2f1f702..ca72d00 100644 --- a/src/traits/dual_vec_ops.rs +++ b/src/traits/dual_vec_ops.rs @@ -67,9 +67,10 @@ impl Rem for DualVec { type Output = Self; #[inline] fn rem(self, rhs: Self) -> Self { + let q = (self.re / rhs.re).trunc(); DualVec { re: self.re % rhs.re, - eps: self.eps, + eps: std::array::from_fn(|k| self.eps[k] - rhs.eps[k] * q), } } } @@ -217,9 +218,10 @@ macro_rules! impl_dual_vec_scalar_ops { type Output = DualVec<$f, N>; #[inline] fn rem(self, rhs: DualVec<$f, N>) -> DualVec<$f, N> { + let q = (self / rhs.re).trunc(); DualVec { re: self % rhs.re, - eps: [<$f>::from(0.0); N], + eps: std::array::from_fn(|k| -rhs.eps[k] * q), } } } diff --git a/src/traits/num_traits_impls.rs b/src/traits/num_traits_impls.rs index 9a7347a..f692eff 100644 --- a/src/traits/num_traits_impls.rs +++ b/src/traits/num_traits_impls.rs @@ -644,7 +644,7 @@ impl NumFloat for Reverse { fn powi(self, n: i32) -> Self { let val = self.value.powi(n); - let deriv = F::from(n).unwrap() * self.value.powi(n - 1); + let deriv = F::from(n).unwrap() * self.value.powf(F::from(n as i64 - 1).unwrap()); rev_unary(self, val, deriv) } @@ -793,8 +793,11 @@ impl NumFloat for Reverse { fn hypot(self, other: Self) -> Self { let h = self.value.hypot(other.value); - let dx = self.value / h; - let dy = other.value / h; + let (dx, dy) = if h == F::zero() { + (F::zero(), F::zero()) + } else { + (self.value / h, other.value / h) + }; rev_binary(self, other, h, dx, dy) } diff --git a/src/traits/std_ops.rs b/src/traits/std_ops.rs index 71ddb39..e6a6897 100644 --- a/src/traits/std_ops.rs +++ b/src/traits/std_ops.rs @@ -5,7 +5,7 @@ use std::ops::{ use crate::dual::Dual; use crate::float::Float; use crate::reverse::Reverse; 
-use crate::tape::{self, TapeThreadLocal, CONSTANT}; +use crate::tape::{self, TapeThreadLocal}; // ────────────────────────────────────────────── // Dual operators @@ -73,7 +73,7 @@ impl Rem for Dual { fn rem(self, rhs: Self) -> Self { Dual { re: self.re % rhs.re, - eps: self.eps, + eps: self.eps - rhs.eps * (self.re / rhs.re).trunc(), } } } @@ -222,9 +222,10 @@ macro_rules! impl_dual_scalar_ops { type Output = Dual<$f>; #[inline] fn rem(self, rhs: Dual<$f>) -> Dual<$f> { + let q = (self / rhs.re).trunc(); Dual { re: self % rhs.re, - eps: <$f>::from(0.0), + eps: -rhs.eps * q, } } } @@ -313,8 +314,8 @@ impl Rem for Reverse { #[inline] fn rem(self, rhs: Self) -> Self { let value = self.value % rhs.value; - let index = - tape::with_active_tape(|t| t.push_binary(self.index, F::one(), rhs.index, F::zero())); + let q = (self.value / rhs.value).trunc(); + let index = tape::with_active_tape(|t| t.push_binary(self.index, F::one(), rhs.index, -q)); Reverse { value, index } } } @@ -454,11 +455,9 @@ macro_rules! impl_reverse_scalar_ops { #[inline] fn rem(self, rhs: Reverse<$f>) -> Reverse<$f> { let value = self % rhs.value; - // Constant remainder w.r.t. denominator has zero derivative. 
- Reverse { - value, - index: CONSTANT, - } + let q = (self / rhs.value).trunc(); + let index = tape::with_active_tape(|t| t.push_unary(rhs.index, -q)); + Reverse { value, index } } } }; diff --git a/src/traits/taylor_std_ops.rs b/src/traits/taylor_std_ops.rs index bcf2892..c399100 100644 --- a/src/traits/taylor_std_ops.rs +++ b/src/traits/taylor_std_ops.rs @@ -73,13 +73,15 @@ impl Neg for Taylor { impl Rem for Taylor { type Output = Self; #[inline] + #[allow(clippy::suspicious_arithmetic_impl)] fn rem(self, rhs: Self) -> Self { + let q = (self.coeffs[0] / rhs.coeffs[0]).trunc(); Taylor { coeffs: std::array::from_fn(|k| { if k == 0 { self.coeffs[0] % rhs.coeffs[0] } else { - self.coeffs[k] + self.coeffs[k] - rhs.coeffs[k] * q } }), } @@ -311,7 +313,10 @@ impl Rem for TaylorDyn { fn rem(self, rhs: Self) -> Self { TaylorDyn::binary_op(&self, &rhs, |a, b, c| { c[0] = a[0] % b[0]; - c[1..].copy_from_slice(&a[1..]); + let q = (a[0] / b[0]).trunc(); + for k in 1..c.len() { + c[k] = a[k] - b[k] * q; + } }) } } diff --git a/tests/nonsmooth.rs b/tests/nonsmooth.rs index c5459ef..586d39d 100644 --- a/tests/nonsmooth.rs +++ b/tests/nonsmooth.rs @@ -280,14 +280,33 @@ fn forward_nonsmooth_detects_ceil_kink() { #[test] fn forward_nonsmooth_detects_round_trunc() { - // round(x) and trunc(x) near integers → kinks detected + // Trunc has kinks at integers, Round has kinks at half-integers. + // Use x = 4.001 (near integer 4) for trunc, x = 3.501 (near half-integer 3.5) for round. 
+ + // Test trunc near integer + let (mut tape, _) = record(|x| x[0].trunc(), &[4.001]); + let info = tape.forward_nonsmooth(&[4.001]); + assert_eq!(info.kinks.len(), 1); + assert_eq!(info.kinks[0].opcode, OpCode::Trunc); + assert_eq!(info.active_kinks(0.01).len(), 1); + + // Test round near half-integer + let (mut tape, _) = record(|x| x[0].round(), &[3.501]); + let info = tape.forward_nonsmooth(&[3.501]); + assert_eq!(info.kinks.len(), 1); + assert_eq!(info.kinks[0].opcode, OpCode::Round); + assert_eq!(info.active_kinks(0.01).len(), 1); + + // Both in one expression: use a value near both a half-integer and an integer + // is impossible (they're 0.5 apart), so test that both kinks are detected + // even when only one is active. let (mut tape, _) = record(|x| x[0].round() + x[0].trunc(), &[4.001]); let info = tape.forward_nonsmooth(&[4.001]); assert_eq!(info.kinks.len(), 2); assert_eq!(info.kinks[0].opcode, OpCode::Round); assert_eq!(info.kinks[1].opcode, OpCode::Trunc); - // Both near integer → active - assert_eq!(info.active_kinks(0.01).len(), 2); + // Only trunc is near its kink (integer); round's kink is at half-integers + assert_eq!(info.active_kinks(0.01).len(), 1); } #[test] diff --git a/tests/opcode_edge_cases.rs b/tests/opcode_edge_cases.rs index 10a66fd..27a4e36 100644 --- a/tests/opcode_edge_cases.rs +++ b/tests/opcode_edge_cases.rs @@ -1,8 +1,10 @@ #![cfg(feature = "bytecode")] use echidna::opcode::{ - eval_forward, forced_reverse_partials, powi_exp_encode, reverse_partials, OpCode, + eval_forward, forced_reverse_partials, powi_exp_decode_raw, powi_exp_encode, reverse_partials, + OpCode, }; +use num_traits::Float; const TOL: f64 = 1e-10; @@ -406,3 +408,59 @@ fn powi_encode_zero() { fn powi_encode_negative() { assert_eq!(powi_exp_encode(-1), u32::MAX); } + +// ══════════════════════════════════════════════ +// powi_exp_decode_raw round-trip +// ══════════════════════════════════════════════ + +#[test] +fn powi_decode_raw_positive() { + 
assert_eq!(powi_exp_decode_raw(powi_exp_encode(5)), 5); +} + +#[test] +fn powi_decode_raw_zero() { + assert_eq!(powi_exp_decode_raw(powi_exp_encode(0)), 0); +} + +#[test] +fn powi_decode_raw_negative() { + assert_eq!(powi_exp_decode_raw(powi_exp_encode(-1)), -1); + assert_eq!(powi_exp_decode_raw(powi_exp_encode(-2)), -2); + assert_eq!(powi_exp_decode_raw(powi_exp_encode(-10)), -10); + assert_eq!(powi_exp_decode_raw(powi_exp_encode(i32::MIN)), i32::MIN); +} + +// ══════════════════════════════════════════════ +// f32 powi tape: value + gradient (regression for the f32 exponent encoding bug) +// ══════════════════════════════════════════════ + +#[test] +fn f32_tape_powi_negative_2() { + // x^(-2) at x=2: value=0.25, gradient=-2*2^(-3)=-0.25 + let (mut tape, _) = echidna::record(|x: &[echidna::BReverse]| x[0].powi(-2), &[2.0f32]); + let grad = tape.gradient(&[2.0f32]); + assert!((tape.values_slice()[tape.output_index()] - 0.25f32).abs() < 1e-6); + assert!((grad[0] - (-0.25f32)).abs() < 1e-6); +} + +#[test] +fn f32_tape_powi_negative_10() { + // x^(-10) at x=2: value=1/1024, gradient=-10*2^(-11)=-10/2048 + let expected_val: f32 = 2.0f32.powi(-10); + let expected_grad: f32 = -10.0 * 2.0f32.powi(-11); + let (mut tape, _) = echidna::record(|x: &[echidna::BReverse]| x[0].powi(-10), &[2.0f32]); + let grad = tape.gradient(&[2.0f32]); + assert!( + (tape.values_slice()[tape.output_index()] - expected_val).abs() < 1e-8, + "value: got {}, expected {}", + tape.values_slice()[tape.output_index()], + expected_val + ); + assert!( + (grad[0] - expected_grad).abs() < 1e-8, + "grad: got {}, expected {}", + grad[0], + expected_grad + ); +} diff --git a/tests/taylor.rs b/tests/taylor.rs index 11593e5..25a8eea 100644 --- a/tests/taylor.rs +++ b/tests/taylor.rs @@ -808,3 +808,66 @@ fn taylor_dyn_through_scalar_generic() { let result = ad_generic_rosenbrock(x, y); assert_relative_eq!(result.value(), 0.0, epsilon = 1e-10); } + +// ══════════════════════════════════════════════ +// powi with 
negative base (regression test) +// ══════════════════════════════════════════════ + +#[test] +fn taylor_powi_negative_base_squared() { + // f(t) = (-2 + t)^2 = 4 - 4t + t^2 + // Coefficients: [4, -4, 1, 0] + let x = Taylor::::variable(-2.0); + let result = x.powi(2); + assert_relative_eq!(result.coeffs[0], 4.0, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[1], -4.0, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[2], 1.0, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[3], 0.0, epsilon = 1e-12); +} + +#[test] +fn taylor_powi_negative_base_cubed() { + // f(t) = (-2 + t)^3 = -8 + 12t - 6t^2 + t^3 + // Coefficients: [-8, 12, -6, 1] + let x = Taylor::::variable(-2.0); + let result = x.powi(3); + assert_relative_eq!(result.coeffs[0], -8.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[1], 12.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[2], -6.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[3], 1.0, epsilon = 1e-10); +} + +#[test] +fn taylor_powi_negative_base_fourth_power() { + // f(t) = (-3 + t)^4 = 81 - 108t + 54t^2 - 12t^3 + t^4 + let x = Taylor::::variable(-3.0); + let result = x.powi(4); + assert_relative_eq!(result.coeffs[0], 81.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[1], -108.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[2], 54.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[3], -12.0, epsilon = 1e-10); + assert_relative_eq!(result.coeffs[4], 1.0, epsilon = 1e-10); +} + +#[test] +fn taylor_powi_negative_base_negative_exponent() { + // f(t) = (-2 + t)^(-1) at t=0: + // f(0) = -0.5, f'(0) = -(-2)^(-2) = -0.25, f''(0)/2 = -(-2)^(-3) = -0.125 + let x = Taylor::::variable(-2.0); + let result = x.powi(-1); + assert_relative_eq!(result.coeffs[0], -0.5, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[1], -0.25, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[2], -0.125, epsilon = 1e-12); + assert_relative_eq!(result.coeffs[3], -0.0625, epsilon = 1e-12); +} + +#[test] +fn 
taylor_powi_positive_base_still_works() {
+    // Regression: positive base should still use the efficient exp-ln path for large exponents.
+    // f(t) = (2 + t)^10 at t=0: c[0] = 1024
+    let x = Taylor::<f64, 4>::variable(2.0);