Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bin/debug-trace-server/src/data_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ impl From<CodeFetchError> for DataProviderError {
fn from(e: CodeFetchError) -> Self {
match e {
CodeFetchError::Deadline(d) => d.into(),
CodeFetchError::BytecodeUnavailable { .. } |
CodeFetchError::VerificationFailure { .. } => eyre::eyre!("{e}").into(),
}
}
Expand Down
16 changes: 13 additions & 3 deletions bin/stateless-validator/src/chain_sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,21 @@ impl BlockProcessor for ValidatorProcessor {
metrics::on_contract_cache_read(contracts.len() as u64, missing_contracts.len() as u64);

if !missing_contracts.is_empty() {
// The unbounded `get_codes` retries transport errors forever, so the only way this
// errors is `VerificationFailure` — a deterministic bad upstream / bad witness.
// The `Deadline` variant can't surface here because no deadline is passed.
// `get_codes` absorbs both transport errors (unbounded RPC retry) and the transient
// "upstream returned empty bytes for a non-empty hash" condition (in-place retry up
// to `bytecode_unavailable_retry_budget`). Anything that surfaces here is fatal:
//
// - `BytecodeUnavailable` — the retry budget elapsed without upstream catching up.
// Halt: a longer outage is operator-visible and shouldn't be hidden behind silent
// cycle restarts.
// - `VerificationFailure` — non-empty bytes whose keccak doesn't match. Deterministic
// bad upstream / bad witness; halt.
// - `Deadline` can't surface because no deadline is passed to `get_codes`.
let fetched =
self.rpc_client.get_codes(&missing_contracts, true).await.map_err(|e| match e {
stateless_common::CodeFetchError::BytecodeUnavailable { .. } => {
fail(format!("Contract bytecode unavailable: {e}"), false)
}
stateless_common::CodeFetchError::VerificationFailure { .. } => {
fail(format!("Contract verification failed: {e}"), false)
}
Expand Down
127 changes: 109 additions & 18 deletions crates/stateless-common/src/rpc_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,12 @@ pub struct RpcClientConfig {
/// jitter) before the next round and doubles the sleep each round up to `max`. Retry is
/// unbounded — caller-visible failures don't occur.
pub rpc_retry: BackoffPolicy,
/// Total time budget for retrying [`CodeFetchError::BytecodeUnavailable`] inside
/// [`RpcClient::get_codes_with_deadline`]. Empty bytecode for a non-empty codehash
/// (mega-reth's `unwrap_or_default()` on a missing-from-state lookup) is retried with the
/// `rpc_retry` schedule until this budget elapses, then surfaces as a fatal error so the
/// validator halts instead of looping forever.
pub bytecode_unavailable_retry_budget: Duration,
}

impl Default for RpcClientConfig {
Expand All @@ -133,6 +139,10 @@ impl Default for RpcClientConfig {
// 30s. That's gentle enough to ride out near-tip witness generation latency (a
// few seconds) without hammering upstream when something is genuinely broken.
rpc_retry: BackoffPolicy::new(Duration::from_millis(500), Duration::from_secs(30)),
// 5 minutes covers a typical rpc-node restart / cold-start window. Beyond that,
// halt rather than spin — a longer outage is operator-visible and shouldn't be
// hidden behind silent retries.
bytecode_unavailable_retry_budget: Duration::from_secs(300),
}
}
}
Expand All @@ -145,6 +155,7 @@ impl std::fmt::Debug for RpcClientConfig {
.field("data_max_concurrent_requests", &self.data_max_concurrent_requests)
.field("witness_max_concurrent_requests", &self.witness_max_concurrent_requests)
.field("rpc_retry", &self.rpc_retry)
.field("bytecode_unavailable_retry_budget", &self.bytecode_unavailable_retry_budget)
.finish()
}
}
Expand Down Expand Up @@ -191,13 +202,26 @@ pub struct SetValidatedBlocksResponse {

/// Errors returned by [`RpcClient::get_codes`] / [`RpcClient::get_codes_with_deadline`].
///
/// - `VerificationFailure` is deterministic (upstream returned bytecode whose keccak does not match
/// the requested hash): validators should halt rather than retry.
/// - `Deadline` only surfaces from [`RpcClient::get_codes_with_deadline`] when the caller- supplied
/// All variants are caller-visible failures the validator should halt on. The transient
/// "upstream not synced yet" condition is absorbed inside `get_codes_with_deadline` by an
/// internal retry loop bounded by `RpcClientConfig::bytecode_unavailable_retry_budget`; only
/// when that budget is exhausted does it escalate to `BytecodeUnavailable`.
///
/// - `BytecodeUnavailable` — upstream returned **empty** bytecode for a non-empty codehash
/// (mega-reth's `eth_getCodeByHash` does `unwrap_or_default()` when its state DB doesn't have the
/// code yet, e.g. during rpc-node restart) and the retry budget elapsed without recovery. Halt: a
/// longer outage is operator-visible.
/// - `VerificationFailure` — upstream returned **non-empty** bytecode whose keccak does not match
/// the requested hash. Deterministic divergence (bad upstream / bad witness); halt.
/// - `Deadline` only surfaces from [`RpcClient::get_codes_with_deadline`] when the caller-supplied
/// deadline elapsed before every per-hash fetch completed. The unbounded [`RpcClient::get_codes`]
/// never produces this variant.
#[derive(Debug, thiserror::Error)]
pub enum CodeFetchError {
#[error(
"RPC provider returned empty bytecode for non-empty codehash {requested:?} (upstream likely not synced)"
)]
BytecodeUnavailable { requested: B256 },
#[error(
"RPC provider returned bytecode with unexpected codehash: expected {requested:?}, got {actual:?}"
)]
Expand Down Expand Up @@ -615,30 +639,65 @@ impl RpcClient {

/// Deadline-aware counterpart of [`Self::get_codes`].
///
/// The same deadline is applied to every per-hash fetch; if it fires during any fetch
/// the whole batch aborts with `Err(CodeFetchError::Deadline(..))`. With `deadline = None`
/// the per-hash retries are unbounded and this function can only fail with
/// `VerificationFailure`.
/// Transport errors are retried unboundedly inside `get_code_with_deadline` (or until the
/// caller-supplied `deadline` fires, surfacing `CodeFetchError::Deadline`). Empty-bytecode
/// responses (upstream not synced yet) are retried in-place with the `rpc_retry` backoff
/// schedule, capped by `RpcClientConfig::bytecode_unavailable_retry_budget`; once the
/// budget elapses the call returns `BytecodeUnavailable` and the validator halts.
/// `VerificationFailure` is never retried — non-empty wrong bytes is a real divergence.
pub async fn get_codes_with_deadline(
&self,
hashes: &[B256],
verify: bool,
deadline: Option<Instant>,
) -> std::result::Result<HashMap<B256, Arc<Bytecode>>, CodeFetchError> {
let backoff = self.config.rpc_retry.clone();
let unavailable_budget = self.config.bytecode_unavailable_retry_budget;

// `try_join_all` cancels the remaining per-hash futures on the first error — a
// `VerificationFailure` or `Deadline` on one hash stops the rest of the batch
// immediately and releases their concurrency permits, so a slow straggler can't
// hold permits until its own per-attempt timeout fires.
future::try_join_all(hashes.iter().map(|&hash| async move {
let bytes = self.get_code_with_deadline(hash, deadline).await?;
let code = Bytecode::new_raw(bytes);
if verify {
let actual = code.hash_slow();
if actual != hash {
return Err(CodeFetchError::VerificationFailure { requested: hash, actual });
// `BytecodeUnavailable` / `VerificationFailure` / `Deadline` on one hash stops the
// rest of the batch immediately and releases their concurrency permits, so a slow
// straggler can't hold permits until its own per-attempt timeout fires.
future::try_join_all(hashes.iter().map(|&hash| {
let backoff = backoff.clone();
async move {
// Per-hash budget for the empty-bytes retry loop. Honor the tighter of the
// caller-supplied deadline (if any) and the configured budget.
let unavailable_deadline = {
let budget_end = Instant::now() + unavailable_budget;
match deadline {
Some(d) => d.min(budget_end),
None => budget_end,
}
};
let mut sleep = backoff.initial;

loop {
let bytes = self.get_code_with_deadline(hash, deadline).await?;
let code = Bytecode::new_raw(bytes.clone());
if !verify {
return Ok::<_, CodeFetchError>((hash, Arc::new(code)));
}
let actual = code.hash_slow();
if actual == hash {
return Ok((hash, Arc::new(code)));
}
if !bytes.is_empty() {
// Non-empty wrong bytes = real divergence; halt without retry.
return Err(CodeFetchError::VerificationFailure { requested: hash, actual });
}
// Empty bytes for a non-empty hash = mega-reth's `unwrap_or_default()` on a
// missing-from-state-DB lookup. Retry with backoff until the budget is up,
// then surface as a fatal error so the validator halts rather than spins.
let now = Instant::now();
if now >= unavailable_deadline {
return Err(CodeFetchError::BytecodeUnavailable { requested: hash });
}
let wait = sleep.min(unavailable_deadline.saturating_duration_since(now));
tokio::time::sleep(wait).await;
sleep = sleep.saturating_mul(2).min(backoff.max);
}
}
Ok::<_, CodeFetchError>((hash, Arc::new(code)))
}))
.await
.map(|pairs| pairs.into_iter().collect())
Expand Down Expand Up @@ -1583,6 +1642,38 @@ mod tests {
handle.stop().unwrap();
}

/// `get_codes(verify=true)` retries empty-bytecode responses (upstream not synced yet) with
/// the `rpc_retry` schedule and surfaces `BytecodeUnavailable` once the configured budget is
/// exhausted. Non-empty wrong bytes is a separate `VerificationFailure` covered elsewhere.
/// Here we shrink the budget to a few milliseconds so the assertion fires quickly.
#[tokio::test]
async fn test_get_codes_verify_empty_bytecode_signals_unavailable() {
// Empty `codes` map → `start_code_rpc` falls back to `Bytes::from_static(&[])` for any
// hash, mimicking mega-reth's `eth_getCodeByHash` behavior when the code isn't in state.
let (handle, url) = start_code_rpc(HashMap::new()).await;
let config = RpcClientConfig {
rpc_retry: BackoffPolicy::new(Duration::from_millis(1), Duration::from_millis(2)),
bytecode_unavailable_retry_budget: Duration::from_millis(10),
..Default::default()
};
let client =
RpcClient::new_with_config(&[url.as_str()], &[url.as_str()], config, None).unwrap();

let some_hash = B256::from([0xAA; 32]);
let err = client
.get_codes(&[some_hash], true)
.await
.expect_err("empty bytes for non-empty hash must fail after retry budget");
match err {
CodeFetchError::BytecodeUnavailable { requested } => {
assert_eq!(requested, some_hash);
}
other => panic!("expected BytecodeUnavailable, got {other:?}"),
}

handle.stop().unwrap();
}

/// All-good batch: verified entries are returned wrapped in `Arc<Bytecode>` (the type
/// change that lets `ContractCache::insert` avoid a second allocation at each call site).
#[tokio::test]
Expand Down
Loading