leanEthereum · TomWambsgans · Apr 22, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/crates/backend/poly/src/eq_mle.rs b/crates/backend/poly/src/eq_mle.rs
@@ -307,11 +307,8 @@ pub fn compute_eval_eq_base_packed<F, EF, const INITIALIZED: bool>(
 }
 
 #[inline]
-pub fn compute_eval_eq_base_packed_batched<F, EF>(
-    evals: &[MultilinearPoint<F>],
-    out: &mut [EF::ExtensionPacking],
-    scalars: &[EF],
-) where
+pub fn compute_eval_eq_base_batched<F, EF>(evals: &[MultilinearPoint<F>], out: &mut [EF], scalars: &[EF])
+where
     F: Field,
     EF: ExtensionField<F>,
 {
@@ -321,22 +318,21 @@ pub fn compute_eval_eq_base_packed_batched<F, EF>(
     }
 
     let n = evals[0].len();
-    let packing_width = F::Packing::WIDTH;
-    let log_packing_width = log2_strict_usize(packing_width);
+    let log_packing_width = log2_strict_usize(F::Packing::WIDTH);
     assert!(log_packing_width <= n);
-    assert_eq!(out.len(), 1 << (n - log_packing_width));
+    assert_eq!(out.len(), 1 << n);
 
     let k = n.min(LOG_BATCHED_TILE_SIZE);
 
     if k <= log_packing_width || k >= n {
         for (eval, &scalar) in evals.iter().zip(scalars) {
-            compute_eval_eq_base_packed::<F, EF, true>(eval, out, scalar);
+            compute_eval_eq_base::<F, EF, true>(eval, out, scalar);
         }
         return;
     }
 
     let n_prefix_levels = n - k;
-    let tile_packed_size = 1 << (k - log_packing_width);
+    let tile_size = 1 << k;
 
     let per_query: Vec<_> = evals
         .iter()
@@ -350,19 +346,14 @@ pub fn compute_eval_eq_base_packed_batched<F, EF>(
         })
         .collect();
 
-    // `out` already splits into `2^n_prefix_levels` tiles — many more than there are
-    // workers — so the pool's task counter load-balances these directly.
-    parallel::par_chunks_mut(out, tile_packed_size, |tile_idx, out_tile| {
+    // `out` splits into `2^n_prefix_levels` tiles — many more than there are workers —
+    // so the pool's task counter load-balances these directly.
+    parallel::par_chunks_mut(out, tile_size, |tile_idx, out_tile| {
         for (eq_prefix, middle, eq_suffix) in &per_query {
             // Here e could precompute the eq poly, trading some memory for less computation
             // (2x faster on M4 max, but 2x slower on machines with smaller caches.
             // TODO implement both and choose based on cache size?)
-            base_eval_eq_packed_with_packed_output::<F, EF, true>(
-                middle,
-                out_tile,
-                *eq_suffix,
-                EF::ExtensionPacking::from(eq_prefix[tile_idx]),
-            );
+            base_eval_eq_packed::<F, EF, true>(middle, out_tile, *eq_suffix, eq_prefix[tile_idx]);
         }
     });
 }

diff --git a/crates/backend/poly/src/point.rs b/crates/backend/poly/src/point.rs
@@ -106,6 +106,15 @@ where
     }
 }
 
+impl<F: Clone> MultilinearPoint<F> {
+    /// Returns a copy of this point with its entries in reverse order.
+    #[must_use]
+    pub fn reversed(&self) -> Self {
+        // Clone each entry while walking the backing vector back to front.
+        Self(self.0.iter().rev().cloned().collect())
+    }
+}
+
 impl<F> From<Vec<F>> for MultilinearPoint<F> {
     fn from(v: Vec<F>) -> Self {
         Self(v)

diff --git a/crates/rec_aggregation/zkdsl_implem/whir.py b/crates/rec_aggregation/zkdsl_implem/whir.py
@@ -126,17 +126,23 @@ def whir_open(
 
     folding_randomness_global = Array(n_vars * DIM)
 
-    start_buf = Array(n_rounds + 2)
-    start_buf[0] = folding_randomness_global
+    # WHIR sumcheck folds LSB-first, so chronological challenges are in reverse polynomial-var
+    # order: chronological challenge #c is written to global position (n_vars - 1 - c), so
+    # the global buffer reads as [x_0, x_1, ..., x_{n_vars-1}]. `chrono_buf` carries the running
+    # chronological index across the `range` loop (range loops may not mutate outer-scope vars).
+    chrono_buf = Array(n_rounds + 2)
+    chrono_buf[0] = 0
     for i in range(0, n_rounds + 1):
-        start: Mut = start_buf[i]
+        chrono: Mut = chrono_buf[i]
         for j in range(0, folding_factors[i]):
-            copy_5(all_folding_randomness[i] + j * DIM, start + j * DIM)
-        start += folding_factors[i] * DIM
-        start_buf[i + 1] = start
-    start = start_buf[n_rounds + 1]
+            target_pos = n_vars - 1 - (chrono + j)
+            copy_5(all_folding_randomness[i] + j * DIM, folding_randomness_global + target_pos * DIM)
+        chrono += folding_factors[i]
+        chrono_buf[i + 1] = chrono
+    chrono = chrono_buf[n_rounds + 1]
     for j in range(0, n_final_vars):
-        copy_5(all_folding_randomness[n_rounds + 1] + j * DIM, start + j * DIM)
+        target_pos = n_vars - 1 - (chrono + j)
+        copy_5(all_folding_randomness[n_rounds + 1] + j * DIM, folding_randomness_global + target_pos * DIM)
 
     all_ood_recovered_evals = Array(num_oods[0] * DIM)
     for i in range(0, num_oods[0]):
@@ -152,6 +158,9 @@ def whir_open(
         num_oods[0],
     )
 
+    # LSB-fold: at round i the polynomial's remaining vars are [x_0, ..., x_{n_vars_remaining-1}],
+    # i.e. the FIRST n_vars_remaining entries of folding_randomness_global (no pointer advance).
+    # eval_carry carries (n_vars_remaining, folding_randomness ptr, running sum) across the loop.
     eval_carry = Array((n_rounds + 1) * 3)
     eval_carry[0] = n_vars
     eval_carry[1] = folding_randomness_global
@@ -164,12 +173,9 @@ def whir_open(
         n_vars_remaining -= folding_factors[i]
         my_ood_recovered_evals = Array(num_oods[i + 1] * DIM)
         combination_randomness_powers = all_combination_randomness_powers[i]
-        my_folding_randomness += folding_factors[i] * DIM
         for j in range(0, num_oods[i + 1]):
             expanded_from_univariate = expand_from_univariate_ext(all_ood_points[i] + j * DIM, n_vars_remaining)
-            poly_eq_extension_dynamic_to(
-                expanded_from_univariate, my_folding_randomness, my_ood_recovered_evals + j * DIM, n_vars_remaining
-            )
+            poly_eq_extension_dynamic_to(expanded_from_univariate, folding_randomness_global, my_ood_recovered_evals + j * DIM, n_vars_remaining)
         summed_ood = Array(DIM)
         dot_product_ee_dynamic(
             my_ood_recovered_evals,
@@ -182,7 +188,7 @@ def whir_open(
         circle_value_i = all_circle_values[i]
         for j in range(0, num_queries[i]):  # unroll ?
             expanded_from_univariate = expand_from_univariate_base(circle_value_i[j], n_vars_remaining)
-            poly_eq_base_extension_to(expanded_from_univariate, my_folding_randomness, s6s + j * DIM, n_vars_remaining)
+            poly_eq_base_extension_to(expanded_from_univariate, folding_randomness_global, s6s + j * DIM, n_vars_remaining)
         s7 = Array(DIM)
         dot_product_ee_dynamic(
             s6s,
@@ -196,10 +202,18 @@ def whir_open(
         eval_carry[base + 4] = my_folding_randomness
         eval_carry[base + 5] = s
     s = eval_carry[n_rounds * 3 + 2]
+
+    # WHIR sumcheck folds LSB-first: final_sumcheck challenges are [r_1=x_{m-1}, ..., r_m=x_0].
+    # eval_multilinear_coeffs_rev computes f(x_j = point[j]), so LSB-fold needs point[j] = r_{m-j}.
+    # Unreversed, point[j] = r_{j+1} is the challenge for x_{m-j-1} (wrong var), so reverse first.
+    final_sumcheck_chals_rev = Array(n_final_vars * DIM)
+    final_sumcheck_chals = all_folding_randomness[n_rounds + 1]
+    for j in range(0, n_final_vars):
+        copy_5(final_sumcheck_chals + (n_final_vars - 1 - j) * DIM, final_sumcheck_chals_rev + j * DIM)
     final_value = match_range(
         n_final_vars,
         range(MAX_NUM_VARIABLES_TO_SEND_COEFFS - WHIR_SUBSEQUENT_FOLDING_FACTOR, MAX_NUM_VARIABLES_TO_SEND_COEFFS + 1),
-        lambda n: eval_multilinear_coeffs_rev(final_coeffcients, all_folding_randomness[n_rounds + 1], n),
+        lambda n: eval_multilinear_coeffs_rev(final_coeffcients, final_sumcheck_chals_rev, n),
     )
     # copy_5(mul_extension_ret(s, final_value), end_sum);
 
@@ -376,7 +390,12 @@ def sample_stir_indexes_and_fold(
 
     folds = Array(num_queries * DIM)
 
-    poly_eq = compute_eq_mle_extension_dynamic(folding_randomness, folding_factor)
+    # WHIR sumcheck folds LSB-first; the leaf is laid out so its first var is the polynomial's
+    # last LSB-folded var. evaluate (poly_eq) is MSB-first, so reverse the per-round challenges.
+    folding_randomness_reversed = Array(folding_factor * DIM)
+    for j in range(0, folding_factor):
+        copy_5(folding_randomness + (folding_factor - 1 - j) * DIM, folding_randomness_reversed + j * DIM)
+    poly_eq = compute_eq_mle_extension_dynamic(folding_randomness_reversed, folding_factor)
 
     if merkle_leaves_in_basefield == 1:
         for i in range(0, num_queries):

diff --git a/crates/sub_protocols/src/quotient_gkr/layers.rs b/crates/sub_protocols/src/quotient_gkr/layers.rs
@@ -84,7 +84,7 @@ impl<'a, EF: ExtensionField<PF<EF>>> LayerStorage<'a, EF> {
         }
     }
 
-    pub fn materialise_in_full(self) -> (Vec<EF>, Vec<EF>) {
+    pub(super) fn materialise_in_full(self) -> (Vec<EF>, Vec<EF>) {
         let natural = match self {
             Self::Natural { .. } => self,
             other => other.convert_to_natural(),

diff --git a/crates/whir/src/commit.rs b/crates/whir/src/commit.rs
@@ -65,24 +65,25 @@ where
         &self,
         prover_state: &mut impl FSProver<EF>,
         polynomial: &MleOwned<EF>,
-        actual_data_len: usize, // polynomial[actual_data_len..] is zero
+        _actual_data_len: usize, // polynomial[_actual_data_len..] is zero
     ) -> Witness<EF> {
         let n_blocks = 1usize << self.folding_factor.at_round(0);
-        let evals_len = 1usize << self.num_variables;
-        let effective_n_cols = actual_data_len.div_ceil(evals_len / n_blocks);
-        // DFT matrix width: skip as many zero columns as possible, aligned to packing (SIMD)
-        let dft_n_cols = effective_n_cols.next_multiple_of(packing_width::<EF>()).min(n_blocks);
 
+        // NOTE: main's zero-COLUMN skip optimization (dft_n_cols / effective_n_cols < n_blocks)
+        // assumed an MSB-cols matrix layout, where the polynomial's zero suffix lands in trailing
+        // columns. The split-eq LSB-cols layout puts the zero suffix in trailing ROWS instead, so
+        // skipping columns would drop live data. We commit all columns (no skip): same root, just
+        // without the prover-side speedup. (The branch optimized this via row-skip in the DFT.)
         let folded_matrix = info_span!("FFT").in_scope(|| {
             reorder_and_dft(
                 &polynomial.by_ref(),
                 self.folding_factor.at_round(0),
                 self.starting_log_inv_rate,
-                dft_n_cols,
+                n_blocks,
             )
         });
 
-        let (prover_data, root) = MerkleData::build(folded_matrix, n_blocks, effective_n_cols);
+        let (prover_data, root) = MerkleData::build(folded_matrix, n_blocks, n_blocks);
 
         prover_state.add_base_scalars(&root);
 

diff --git a/crates/whir/src/lib.rs b/crates/whir/src/lib.rs
@@ -27,6 +27,9 @@ pub(crate) use utils::*;
 mod matrix;
 pub(crate) use matrix::*;
 
+mod svo;
+pub(crate) use svo::*;
+
 #[derive(Clone, Debug)]
 pub struct SparseStatement<EF> {
     pub total_num_variables: usize,