From e57d2a7871bd58a7d4f0a8bf201e6f532e8cee18 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 7 May 2026 14:32:23 -0300 Subject: [PATCH] chore: add actor-level retry for infra errors --- CHANGELOG.md | 1 + Cargo.lock | 28 ++- Cargo.toml | 1 + crates/ntx-builder/Cargo.toml | 1 + crates/ntx-builder/src/actor/execute.rs | 140 +++++++++++ crates/ntx-builder/src/actor/mod.rs | 315 +++++++++++++++++++++--- crates/ntx-builder/src/lib.rs | 30 +++ 7 files changed, 477 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59a710e02..6b6dcf7a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Replaced blocking-in-async operations in the validator, remote prover, and ntx-builder with `spawn_blocking` to avoid starving the Tokio runtime ([#2041](https://github.com/0xMiden/node/pull/2041)). - Implement persistent RocksDB backend for `AccountStateForest`, improving startup time ([#2020](https://github.com/0xMiden/node/pull/2020)). +- Fixed network transaction builder permanently dropping notes after transient infrastructure failures. These now retry with exponential backoff at the actor level instead of consuming per-note retry budget ([#2052](https://github.com/0xMiden/node/issues/2052)). 
## v0.14.10 (2026-05-29) diff --git a/Cargo.lock b/Cargo.lock index 6dc505da8..ffd022b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -688,6 +688,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "backon" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" +dependencies = [ + "fastrand", + "gloo-timers", + "tokio", +] + [[package]] name = "backtrace" version = "0.3.76" @@ -1928,6 +1939,18 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "gloo-timers" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "governor" version = "0.10.4" @@ -3242,6 +3265,7 @@ name = "miden-node-ntx-builder" version = "0.14.10" dependencies = [ "anyhow", + "backon", "build-rs", "diesel", "diesel_migrations", @@ -4784,7 +4808,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "petgraph 0.8.3", @@ -4805,7 +4829,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.117", diff --git a/Cargo.toml b/Cargo.toml index e2416f9e8..e836f04ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ miden-crypto = { version = "0.23" } anyhow = { version = "1.0" } assert_matches = { version = "1.5" } async-trait = { version = "0.1" } +backon = { version = 
"1.6" } build-rs = { version = "0.3" } clap = { features = ["derive"], version = "4.5" } deadpool = { default-features = false, version = "0.12" } diff --git a/crates/ntx-builder/Cargo.toml b/crates/ntx-builder/Cargo.toml index 629776da1..b015d762f 100644 --- a/crates/ntx-builder/Cargo.toml +++ b/crates/ntx-builder/Cargo.toml @@ -18,6 +18,7 @@ doctest = false [dependencies] anyhow = { workspace = true } +backon = { workspace = true } diesel = { features = ["numeric", "sqlite"], workspace = true } diesel_migrations = { features = ["sqlite"], workspace = true } futures = { workspace = true } diff --git a/crates/ntx-builder/src/actor/execute.rs b/crates/ntx-builder/src/actor/execute.rs index 2b465adbd..ad61b5d5c 100644 --- a/crates/ntx-builder/src/actor/execute.rs +++ b/crates/ntx-builder/src/actor/execute.rs @@ -72,6 +72,66 @@ pub enum NtxError { Submission(#[source] tonic::Status), } +/// Classifies an [`NtxError`] as caused by infrastructure (transient: node, prover, or transport +/// problem) or intrinsic to the transaction batch. +/// +/// Infrastructure failures must not consume per-note retry budget, intrinsic failures must. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ErrorKind { + /// Transient infrastructure failure: prover unreachable, validator/block-producer down, + /// transport error, or our own checker erroring on a store fetch. The note batch is not + /// to blame, we need to retry without penalising notes. + Infrastructure, + /// The note batch itself is the problem: consumability check rejected notes, the executor or + /// local prover failed on this specific batch, or the validator/block-producer rejected the + /// transaction content. Penalise the notes per the existing retry policy. + Intrinsic, +} + +impl NtxError { + /// Returns whether this error is caused by infrastructure or by the transaction batch. 
+ pub fn kind(&self) -> ErrorKind { + match self { + Self::AllNotesFailed(_) | Self::Execution(_) => ErrorKind::Intrinsic, + Self::NoteFilter(_) | Self::InputNotes(_) => ErrorKind::Infrastructure, + Self::Proving(err) => match err { + // The remote prover client wraps every transport / connection / deserialization + // failure in `TransactionProverError::Other`. + TransactionProverError::Other { .. } => ErrorKind::Infrastructure, + TransactionProverError::AccountDeltaApplyFailed(_) + | TransactionProverError::RemoveFeeAssetFromDelta(_) + | TransactionProverError::TransactionOutputConstructionFailed(_) + | TransactionProverError::OutputNoteShrinkFailed(_) + | TransactionProverError::ProvenTransactionBuildFailed(_) + | TransactionProverError::TransactionProgramExecutionFailed(_) => { + ErrorKind::Intrinsic + }, + }, + // gRPC status codes split into transport / server-side hiccups (Infrastructure) + // versus content-rejection codes (Intrinsic). + Self::Submission(status) => match status.code() { + tonic::Code::Unavailable + | tonic::Code::DeadlineExceeded + | tonic::Code::Cancelled + | tonic::Code::Aborted + | tonic::Code::Unknown + | tonic::Code::Internal + | tonic::Code::ResourceExhausted => ErrorKind::Infrastructure, + tonic::Code::InvalidArgument + | tonic::Code::FailedPrecondition + | tonic::Code::OutOfRange + | tonic::Code::NotFound + | tonic::Code::AlreadyExists + | tonic::Code::Unauthenticated + | tonic::Code::PermissionDenied + | tonic::Code::Unimplemented + | tonic::Code::DataLoss + | tonic::Code::Ok => ErrorKind::Intrinsic, + }, + } + } +} + type NtxResult = Result; /// The result of a successful transaction execution. 
@@ -679,3 +739,83 @@ impl StorageSlotRegistry { slots.get(&(account_id, map_root)).cloned() } } + +#[cfg(test)] +mod tests { + use miden_protocol::errors::TransactionInputError; + use miden_tx::{NoteCheckerError, TransactionExecutorError, TransactionProverError}; + + use super::{ErrorKind, NtxError}; + + #[test] + fn error_kind_matrix() { + // Submission: tonic codes mapping to Infrastructure. + for (code, ctor) in [ + ("unavailable", tonic::Status::unavailable as fn(&'static str) -> tonic::Status), + ("deadline_exceeded", tonic::Status::deadline_exceeded), + ("cancelled", tonic::Status::cancelled), + ("aborted", tonic::Status::aborted), + ("unknown", tonic::Status::unknown), + ("internal", tonic::Status::internal), + ("resource_exhausted", tonic::Status::resource_exhausted), + ] { + assert_eq!( + NtxError::Submission(ctor("test")).kind(), + ErrorKind::Infrastructure, + "expected Submission({code}) to be Infrastructure", + ); + } + + // Submission: tonic codes mapping to Intrinsic. + for (code, ctor) in [ + ( + "invalid_argument", + tonic::Status::invalid_argument as fn(&'static str) -> tonic::Status, + ), + ("failed_precondition", tonic::Status::failed_precondition), + ("out_of_range", tonic::Status::out_of_range), + ("not_found", tonic::Status::not_found), + ("already_exists", tonic::Status::already_exists), + ("unauthenticated", tonic::Status::unauthenticated), + ("permission_denied", tonic::Status::permission_denied), + ("unimplemented", tonic::Status::unimplemented), + ("data_loss", tonic::Status::data_loss), + ] { + assert_eq!( + NtxError::Submission(ctor("test")).kind(), + ErrorKind::Intrinsic, + "expected Submission({code}) to be Intrinsic", + ); + } + + // Proving: only the catch-all `Other` variant + assert_eq!( + NtxError::Proving(TransactionProverError::other("remote prover unreachable")).kind(), + ErrorKind::Infrastructure, + ); + assert_eq!( + NtxError::Proving(TransactionProverError::other_with_source( + "wrapped", + std::io::Error::other("boom"), + 
)) + .kind(), + ErrorKind::Infrastructure, + ); + + // The remaining failure modes are batch-intrinsic. + assert_eq!(NtxError::AllNotesFailed(Vec::new()).kind(), ErrorKind::Intrinsic); + assert_eq!( + NtxError::Execution(TransactionExecutorError::FeeAssetMustBeFungible).kind(), + ErrorKind::Intrinsic, + ); + + assert_eq!( + NtxError::NoteFilter(NoteCheckerError::InputNoteCountOutOfRange(0)).kind(), + ErrorKind::Infrastructure, + ); + assert_eq!( + NtxError::InputNotes(TransactionInputError::TooManyInputNotes(usize::MAX)).kind(), + ErrorKind::Infrastructure, + ); + } +} diff --git a/crates/ntx-builder/src/actor/mod.rs b/crates/ntx-builder/src/actor/mod.rs index fa49ccccf..e07c68c8f 100644 --- a/crates/ntx-builder/src/actor/mod.rs +++ b/crates/ntx-builder/src/actor/mod.rs @@ -6,13 +6,14 @@ use std::sync::Arc; use std::time::Duration; use anyhow::Context; +use backon::{BackoffBuilder, ExponentialBackoff, ExponentialBuilder}; use candidate::TransactionCandidate; use futures::FutureExt; use miden_node_proto::domain::account::NetworkAccountId; use miden_node_utils::ErrorReport; use miden_node_utils::lru_cache::LruCache; use miden_protocol::Word; -use miden_protocol::account::{Account, AccountDelta}; +use miden_protocol::account::{Account, AccountDelta, AccountId}; use miden_protocol::block::BlockNumber; use miden_protocol::note::{NoteScript, Nullifier}; use miden_protocol::transaction::TransactionId; @@ -22,9 +23,11 @@ use tokio::sync::{Notify, Semaphore, mpsc}; use tokio_util::sync::CancellationToken; use crate::NoteError; +use crate::actor::execute::ErrorKind; use crate::chain_state::{ChainState, SharedChainState}; use crate::clients::{BlockProducerClient, StoreClient, ValidatorClient}; use crate::db::Db; +use crate::inflight_note::InflightNetworkNote; // ACTOR REQUESTS // ================================================================================================ @@ -77,6 +80,11 @@ pub struct AccountActorContext { pub request_tx: mpsc::Sender, /// Maximum number 
of VM execution cycles for network transactions. pub max_cycles: u32, + /// Initial actor-level sleep after an infrastructure-classified failure. Doubles on each + /// consecutive infra failure up to [`Self::infra_failure_backoff_max`] and resets on success. + pub infra_failure_backoff_initial: Duration, + /// Upper bound on the actor-level infra-failure backoff sleep. + pub infra_failure_backoff_max: Duration, } #[cfg(test)] @@ -112,6 +120,8 @@ impl AccountActorContext { db: db.clone(), request_tx, max_cycles: 1 << 18, + infra_failure_backoff_initial: Duration::from_millis(1), + infra_failure_backoff_max: Duration::from_millis(10), } } } @@ -166,6 +176,25 @@ enum ActorMode { TransactionInflight(TransactionId), } +/// Outcome of an `execute_transactions` call. +/// +/// Distinguishes infrastructure failures (caller should sleep with backoff and retry the same +/// notes without penalising them) from intrinsic failures (notes were marked failed and the +/// caller should idle until something changes) and successful submission. +#[derive(Debug)] +enum ExecutionOutcome { + /// Transaction was submitted, the actor should wait for mempool confirmation. + Inflight(TransactionId), + /// The transaction batch is intrinsically bad (notes failed consumability, executor or local + /// prover rejected the witness, validator/block-producer rejected the content). Notes have + /// already been marked failed and the actor should idle. + IntrinsicFailure, + /// An infrastructure-level component failed (prover unreachable, validator/block-producer + /// transport error, our own checker erroring). Notes were *not* penalised and the actor should + /// sleep for the configured backoff and retry the same candidate. 
+ InfrastructureFailure, +} + // ACCOUNT ACTOR // ================================================================================================ @@ -221,6 +250,15 @@ pub struct AccountActor { request_tx: mpsc::Sender, /// Maximum number of VM execution cycles for network transactions. max_cycles: u32, + /// Initial sleep after an infrastructure-classified failure. Used to rebuild the backoff + /// iterator on success. + infra_failure_backoff_initial: Duration, + /// Upper bound on the actor-level infra-failure backoff sleep. + infra_failure_backoff_max: Duration, + /// Exponential backoff applied at the actor level after consecutive infrastructure-classified + /// failures. Rebuilt on every successful submission / intrinsic failure so the next infra + /// outage starts from `infra_failure_backoff_initial` again. + infra_backoff: ExponentialBackoff, } impl AccountActor { @@ -248,6 +286,12 @@ impl AccountActor { idle_timeout: actor_context.idle_timeout, request_tx: actor_context.request_tx.clone(), max_cycles: actor_context.max_cycles, + infra_failure_backoff_initial: actor_context.infra_failure_backoff_initial, + infra_failure_backoff_max: actor_context.infra_failure_backoff_max, + infra_backoff: build_infra_backoff( + actor_context.infra_failure_backoff_initial, + actor_context.infra_failure_backoff_max, + ), } } @@ -327,7 +371,29 @@ impl AccountActor { ).await?; if let Some(tx_candidate) = tx_candidate { - self.execute_transactions(account_id, tx_candidate).await; + match self.execute_transactions(account_id, tx_candidate).await { + ExecutionOutcome::Inflight(tx_id) => { + self.reset_infra_backoff(); + self.mode = ActorMode::TransactionInflight(tx_id); + }, + ExecutionOutcome::IntrinsicFailure => { + self.reset_infra_backoff(); + self.mode = ActorMode::NoViableNotes; + }, + ExecutionOutcome::InfrastructureFailure => { + let sleep = self + .infra_backoff + .next() + .unwrap_or(self.infra_failure_backoff_max); + tracing::warn!( + %account_id, + sleep_ms = 
sleep.as_millis() as u64, + "sleeping after infrastructure failure before retrying", + ); + tokio::time::sleep(sleep).await; + self.mode = ActorMode::NotesAvailable; + }, + } } else { // No transactions to execute, wait for events. self.mode = ActorMode::NoViableNotes; @@ -378,13 +444,15 @@ impl AccountActor { /// Execute a transaction candidate and mark notes as failed as required. /// - /// Updates the state of the actor based on the execution result. + /// Returns an [`ExecutionOutcome`] which the caller maps to the next [`ActorMode`]. + /// Infrastructure failures do *not* mark notes as failed and request the caller to sleep + /// before retrying the same candidate. #[tracing::instrument(name = "ntx.actor.execute_transactions", skip(self, tx_candidate))] async fn execute_transactions( &mut self, account_id: NetworkAccountId, tx_candidate: TransactionCandidate, - ) { + ) -> ExecutionOutcome { let block_num = tx_candidate.chain_tip_header.block_num(); // Execute the selected transaction. @@ -422,44 +490,28 @@ impl AccountActor { let failed_notes = log_failed_notes(failed); self.mark_notes_failed(&failed_notes, block_num).await; } - self.mode = ActorMode::TransactionInflight(tx_id); + ExecutionOutcome::Inflight(tx_id) }, - // Transaction execution failed. - Err(err) => { - let error_msg = err.as_report(); - tracing::error!( - %account_id, - ?note_ids, - err = %error_msg, - "network transaction failed", - ); - self.mode = ActorMode::NoViableNotes; - - // For `AllNotesFailed`, use the per-note errors which contain the - // specific reason each note failed (e.g. consumability check details). 
- let failed_notes: Vec<_> = match err { - execute::NtxError::AllNotesFailed(per_note) => log_failed_notes(per_note), - other => { - let error: NoteError = Arc::new(other); - notes - .iter() - .map(|note| { - tracing::info!( - note.id = %note.to_inner().as_note().id(), - nullifier = %note.nullifier(), - err = %error_msg, - "note failed: transaction execution error", - ); - (note.nullifier(), error.clone()) - }) - .collect() - }, - }; - self.mark_notes_failed(&failed_notes, block_num).await; + Err(err) => match classify_failure(account_id, &notes, err) { + FailureOutcome::Infrastructure => ExecutionOutcome::InfrastructureFailure, + FailureOutcome::Intrinsic(failed_notes) => { + if !failed_notes.is_empty() { + self.mark_notes_failed(&failed_notes, block_num).await; + } + ExecutionOutcome::IntrinsicFailure + }, }, } } + /// Rebuilds the actor-local infra-failure backoff iterator so the next infra outage starts + /// from `infra_failure_backoff_initial`. Called after any successful submission or any + /// intrinsic failure. + fn reset_infra_backoff(&mut self) { + self.infra_backoff = + build_infra_backoff(self.infra_failure_backoff_initial, self.infra_failure_backoff_max); + } + /// Sends requests to the coordinator to cache note scripts fetched from the remote store. async fn cache_note_scripts(&self, scripts: Vec<(Word, NoteScript)>) { for (script_root, script) in scripts { @@ -516,3 +568,192 @@ fn log_failed_notes(failed: Vec) -> Vec<(Nullifier, NoteError)> { }) .collect() } + +/// What the actor should do with a failed [`execute::NtxError`]. +#[derive(Debug)] +enum FailureOutcome { + /// An infrastructure-level component failed (prover unreachable, validator/block-producer + /// transport error, our own checker erroring). Notes are *not* penalised and caller should + /// sleep for the configured backoff and retry the same candidate. 
+ Infrastructure, + /// The transaction batch is intrinsically bad (notes failed consumability, executor or local + /// prover rejected the witness, validator/block-producer rejected the content). Caller should + /// mark the carried notes failed and idle. + Intrinsic(Vec<(Nullifier, NoteError)>), +} + +/// Decides what to do with a failed [`execute::NtxError`]: which notes to mark failed, and the +/// resulting [`FailureOutcome`]. +/// +/// - Infrastructure errors return `Infrastructure`: caller sleeps and retries. +/// - Intrinsic `AllNotesFailed` returns per-note errors carried in the variant. +/// - Any other intrinsic variant attributes the wrapped batch-level error to every note in the +/// batch. +fn classify_failure( + account_id: AccountId, + notes: &[InflightNetworkNote], + err: execute::NtxError, +) -> FailureOutcome { + let error_msg = err.as_report(); + let note_ids: Vec<_> = notes.iter().map(|n| n.to_inner().as_note().id()).collect(); + match err.kind() { + ErrorKind::Infrastructure => { + tracing::warn!( + %account_id, + ?note_ids, + err = %error_msg, + "network transaction failed due to infrastructure error; notes not penalised, \ + will retry after backoff", + ); + FailureOutcome::Infrastructure + }, + ErrorKind::Intrinsic => { + tracing::error!( + %account_id, + ?note_ids, + err = %error_msg, + "network transaction failed", + ); + let failed_notes: Vec<_> = match err { + execute::NtxError::AllNotesFailed(per_note) => log_failed_notes(per_note), + other => { + let error: NoteError = Arc::new(other); + notes + .iter() + .map(|note| { + tracing::info!( + note.id = %note.to_inner().as_note().id(), + nullifier = %note.nullifier(), + err = %error_msg, + "note failed: transaction execution error", + ); + (note.nullifier(), error.clone()) + }) + .collect() + }, + }; + FailureOutcome::Intrinsic(failed_notes) + }, + } +} + +/// Builds the [`ExponentialBackoff`] used at the actor level after infrastructure-classified +/// failures. 
+fn build_infra_backoff(initial: Duration, max: Duration) -> ExponentialBackoff { + ExponentialBuilder::default() + .with_min_delay(initial) + .with_max_delay(max) + .with_factor(2.0) + .without_max_times() + .with_jitter() + .build() +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use miden_tx::{TransactionExecutorError, TransactionProverError}; + + use super::{FailureOutcome, build_infra_backoff, classify_failure, execute}; + use crate::inflight_note::InflightNetworkNote; + use crate::test_utils::{mock_network_account_id, mock_single_target_note}; + + #[test] + fn infra_backoff_is_bounded_and_unbounded_in_length() { + let initial = Duration::from_secs(1); + let max = Duration::from_secs(30); + let upper_with_jitter = max.saturating_mul(2); + let mut backoff = build_infra_backoff(initial, max); + + for _ in 0..50 { + let delay = backoff.next().expect("backoff should be unbounded"); + assert!( + delay <= upper_with_jitter, + "delay {delay:?} exceeds {upper_with_jitter:?} (max + jitter)", + ); + } + } + + /// Returns 3 distinct mock notes targeting the same account. + fn mock_notes() -> Vec { + let account = mock_network_account_id(); + (0u8..3) + .map(|seed| InflightNetworkNote::new(mock_single_target_note(account, seed))) + .collect() + } + + fn mock_account_id() -> miden_protocol::account::AccountId { + mock_network_account_id().inner() + } + + /// Infrastructure errors return `Infrastructure`, no notes are penalised. 
+ #[test] + fn classify_failure_infra_skips_marking_notes() { + let notes = mock_notes(); + let cases: Vec = vec![ + execute::NtxError::Submission(tonic::Status::unavailable("bp down")), + execute::NtxError::Submission(tonic::Status::deadline_exceeded("timeout")), + execute::NtxError::Submission(tonic::Status::internal("internal")), + execute::NtxError::Proving(TransactionProverError::other("remote prover unreachable")), + execute::NtxError::NoteFilter(miden_tx::NoteCheckerError::InputNoteCountOutOfRange(0)), + execute::NtxError::InputNotes( + miden_protocol::errors::TransactionInputError::TooManyInputNotes(usize::MAX), + ), + ]; + for err in cases { + let display = format!("{err:?}"); + let outcome = classify_failure(mock_account_id(), &notes, err); + assert!( + matches!(outcome, FailureOutcome::Infrastructure), + "expected Infrastructure for `{display}`, got {outcome:?}", + ); + } + } + + /// `Submission` with a content-rejection code (`InvalidArgument`) is intrinsic, every note + /// in the batch is marked failed with the same wrapped error. + #[test] + fn classify_failure_submission_invalid_argument_marks_all_notes() { + let notes = mock_notes(); + let err = execute::NtxError::Submission(tonic::Status::invalid_argument("bad tx")); + let outcome = classify_failure(mock_account_id(), &notes, err); + let FailureOutcome::Intrinsic(failed) = outcome else { + panic!("expected Intrinsic, got {outcome:?}"); + }; + assert_eq!(failed.len(), notes.len(), "all notes should be marked failed"); + for note in &notes { + assert!( + failed.iter().any(|(n, _)| *n == note.nullifier()), + "missing nullifier {} in failed list", + note.nullifier(), + ); + } + } + + /// A structured (non-`Other`) `Execution` variant is intrinsic, the local execution rejected + /// the batch, so all notes are marked failed. 
+ #[test] + fn classify_failure_local_execution_marks_all_notes() { + let notes = mock_notes(); + let err = execute::NtxError::Execution(TransactionExecutorError::FeeAssetMustBeFungible); + let outcome = classify_failure(mock_account_id(), &notes, err); + let FailureOutcome::Intrinsic(failed) = outcome else { + panic!("expected Intrinsic, got {outcome:?}"); + }; + assert_eq!(failed.len(), notes.len()); + } + + /// `AllNotesFailed` carries its own per-note attribution, so an empty per-note vec produces + /// no DB updates while still being intrinsic. + #[test] + fn classify_failure_all_notes_failed_uses_per_note_attribution() { + let notes = mock_notes(); + let err = execute::NtxError::AllNotesFailed(Vec::new()); + let outcome = classify_failure(mock_account_id(), &notes, err); + let FailureOutcome::Intrinsic(failed) = outcome else { + panic!("expected Intrinsic, got {outcome:?}"); + }; + assert!(failed.is_empty()); + } +} diff --git a/crates/ntx-builder/src/lib.rs b/crates/ntx-builder/src/lib.rs index 4eb8c6386..db19e336a 100644 --- a/crates/ntx-builder/src/lib.rs +++ b/crates/ntx-builder/src/lib.rs @@ -67,6 +67,15 @@ const DEFAULT_IDLE_TIMEOUT: Duration = Duration::from_secs(5 * 60); /// Default maximum number of crashes an account actor is allowed before being deactivated. const DEFAULT_MAX_ACCOUNT_CRASHES: usize = 10; +/// Default initial sleep applied at the actor level after an infrastructure-classified failure +/// (downed prover, transport error, validator/block-producer crash). Doubles on each consecutive +/// infra failure up to [`DEFAULT_INFRA_FAILURE_BACKOFF_MAX`]. Resets on the next successful +/// transaction. +const DEFAULT_INFRA_FAILURE_BACKOFF_INITIAL: Duration = Duration::from_secs(1); + +/// Default upper bound on the actor-level infra-failure backoff sleep. +const DEFAULT_INFRA_FAILURE_BACKOFF_MAX: Duration = Duration::from_secs(30); + /// Default maximum number of VM execution cycles allowed for a network transaction. 
/// /// This limits the computational cost of network transactions. The protocol maximum is @@ -131,6 +140,15 @@ pub struct NtxBuilderConfig { /// Defaults to 2^18 cycles. pub max_cycles: u32, + /// Initial actor-level sleep after an infrastructure-classified failure (e.g. prover + /// unreachable, validator/block-producer crash, transport error). Doubles on each consecutive + /// infra failure up to [`Self::infra_failure_backoff_max`] and resets on success. Per-note + /// `attempt_count` is *not* advanced for infra failures. + pub infra_failure_backoff_initial: Duration, + + /// Upper bound on the actor-level infra-failure backoff sleep. + pub infra_failure_backoff_max: Duration, + /// Path to the SQLite database file used for persistent state. pub database_filepath: PathBuf, } @@ -156,6 +174,8 @@ impl NtxBuilderConfig { idle_timeout: DEFAULT_IDLE_TIMEOUT, max_account_crashes: DEFAULT_MAX_ACCOUNT_CRASHES, max_cycles: DEFAULT_MAX_TX_CYCLES, + infra_failure_backoff_initial: DEFAULT_INFRA_FAILURE_BACKOFF_INITIAL, + infra_failure_backoff_max: DEFAULT_INFRA_FAILURE_BACKOFF_MAX, database_filepath, } } @@ -244,6 +264,14 @@ impl NtxBuilderConfig { self } + /// Sets the actor-level infra-failure backoff bounds (initial sleep and cap). + #[must_use] + pub fn with_infra_failure_backoff(mut self, initial: Duration, max: Duration) -> Self { + self.infra_failure_backoff_initial = initial; + self.infra_failure_backoff_max = max; + self + } + /// Builds and initializes the network transaction builder. /// /// This method connects to the store and block producer services, fetches the current @@ -307,6 +335,8 @@ impl NtxBuilderConfig { db: db.clone(), request_tx, max_cycles: self.max_cycles, + infra_failure_backoff_initial: self.infra_failure_backoff_initial, + infra_failure_backoff_max: self.infra_failure_backoff_max, }; Ok(NetworkTransactionBuilder::new(