From e9f2c264e4f5bcc0ada0ab6856020602747e9979 Mon Sep 17 00:00:00 2001 From: vincent Date: Thu, 5 Feb 2026 15:17:58 -0500 Subject: [PATCH 1/4] fix: implement retry logic for opening DHT records with backoff --- src/backend.rs | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/backend.rs b/src/backend.rs index f45db69..8051948 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -349,12 +349,35 @@ impl Backend { keys.public_key.clone().into_value(), keys.secret_key.clone().unwrap().into_value(), ); - let dht_record = routing_context - .open_dht_record( - record_key.clone(), - Some(KeyPair::new(CRYPTO_KIND_VLD0, bare_keypair)), - ) - .await?; + let keypair = Some(KeyPair::new(CRYPTO_KIND_VLD0, bare_keypair)); + let mut dht_record = None; + let max_retries = 6; + let mut retries = max_retries; + + while retries > 0 { + retries -= 1; + match routing_context + .open_dht_record(record_key.clone(), keypair.clone()) + .await + { + Ok(record) => { + dht_record = Some(record); + break; + } + Err(e) => { + warn!("Failed to open group DHT record: {e}. Retries left: {retries}"); + if retries == 0 { + return Err(anyhow!( + "Unable to open group DHT record after {max_retries} attempts: {e}" + )); + } + } + } + let backoff_ms = 500 * (max_retries - retries) as u64; + tokio::time::sleep(std::time::Duration::from_millis(backoff_ms)).await; + } + + let dht_record = dht_record.ok_or_else(|| anyhow!("Group DHT record retrieval failed"))?; let mut group = Group::new( dht_record.clone(), From c78fb9be5eaa971dd719fe2902221d518605360e Mon Sep 17 00:00:00 2001 From: vincent Date: Thu, 12 Feb 2026 00:19:48 -0500 Subject: [PATCH 2/4] fix: store keypair after join and improve download timeouts --- src/backend.rs | 12 ++++++++++++ src/group.rs | 32 +++++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/backend.rs b/src/backend.rs index 8051948..67e5721 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -379,6 +379,18 @@ impl Backend { let dht_record = dht_record.ok_or_else(|| anyhow!("Group DHT record retrieval failed"))?; + // Persist the keypair so refresh_group/get_group can reload it later + let protected_store = veilid.protected_store().unwrap(); + CommonKeypair { + id: record_key.clone(), + public_key: keys.public_key.clone(), + secret_key: keys.secret_key.clone(), + encryption_key: keys.encryption_key.clone(), + } + .store_keypair(&protected_store) + .await + .map_err(|e| anyhow!(e))?; + let mut group = Group::new( dht_record.clone(), keys.encryption_key.clone(), diff --git a/src/group.rs b/src/group.rs index fa85cab..dd5462b 100644 --- a/src/group.rs +++ b/src/group.rs @@ -138,9 +138,13 @@ impl Group { } // Retry configuration - const MAX_RETRIES: u32 = 3; + // Veilid route establishment + iroh tunnel setup can be transiently flaky, especially + // under load in CI or when routes are regenerating. Keep this bounded but resilient. + // Keep this bounded: higher-level callers (HTTP endpoints/tests) can retry too. + const MAX_RETRIES: u32 = 5; const INITIAL_DELAY_MS: u64 = 500; - const MAX_DELAY_MS: u64 = 2000; + const MAX_DELAY_MS: u64 = 4000; + const PER_PEER_TIMEOUT_SECS: u64 = 10; for attempt in 0..MAX_RETRIES { for repo in repos.iter() { @@ -153,26 +157,36 @@ impl Group { if let Ok(route_id_blob) = repo.get_route_id_blob().await { // It's faster to try and fail, than to ask then try - let result = self - .iroh_blobs - .download_file_from(route_id_blob, hash) - .await; - + // Guard against hung downloads so a single peer doesn't stall the whole request. + let result = tokio::time::timeout( + tokio::time::Duration::from_secs(PER_PEER_TIMEOUT_SECS), + self.iroh_blobs.download_file_from(route_id_blob, hash), + ) + .await; + match result { - Ok(()) => { + Ok(Ok(())) => { info!("Successfully downloaded hash {} from peer {}", hash.to_hex(), hex::encode(repo.id().opaque().ref_value()) ); return Ok(()); } - Err(e) => { + Ok(Err(e)) => { warn!( "Unable to download from peer {}: {}", hex::encode(repo.id().opaque().ref_value()), e ); } + Err(_) => { + warn!( + "Timed out downloading hash {} from peer {} after {}s", + hash.to_hex(), + hex::encode(repo.id().opaque().ref_value()), + PER_PEER_TIMEOUT_SECS + ); + } } } else { warn!( From 14431707acac2396e3be3b52d0136dd763e60f13 Mon Sep 17 00:00:00 2001 From: vincent Date: Thu, 12 Feb 2026 00:37:17 -0500 Subject: [PATCH 3/4] ci: pin rust to 1.88.0 to fix keyvaluedb-sqlite build --- .github/workflows/lint_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index 5021fab..67afc5a 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -11,7 +11,7 @@ jobs: - name: Set up Rust toolchain uses: hecrj/setup-rust-action@v2 with: - rust-version: stable + rust-version: 1.88.0 - name: Check out the code uses: actions/checkout@v4 From 9615dcd580047d8b51e39b910efb74f119bf5635 Mon Sep 17 00:00:00 2001 From: vincent Date: Thu, 12 Feb 2026 00:53:21 -0500 Subject: [PATCH 4/4] fix: pin keyvaluedb 0.1.6 to fix CI build --- .github/workflows/lint_and_test.yml | 2 +- Cargo.toml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml index 67afc5a..5021fab 100644 --- a/.github/workflows/lint_and_test.yml +++ b/.github/workflows/lint_and_test.yml @@ -11,7 +11,7 @@ jobs: - name: Set up Rust toolchain uses: hecrj/setup-rust-action@v2 with: - rust-version: 1.88.0 + rust-version: stable - name: Check out the code uses: actions/checkout@v4 diff --git a/Cargo.toml b/Cargo.toml index 934159a..961c030 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,11 +8,14 @@ iroh = "0.24.0" iroh-blobs = "0.24.0" # Pin to avoid keyvaluedb-sqlite 0.1.6 + rusqlite 0.38 break (u64 FromSql not implemented) keyvaluedb-sqlite = "=0.1.5" +keyvaluedb = "=0.1.6" # pin to prevent 0.1.7 which breaks keyvaluedb-sqlite 0.1.5 rusqlite = "=0.37.0" veilid-core = { git = "https://gitlab.com/veilid/veilid.git", tag = "v0.5.1" } +veilid-tools = { git = "https://gitlab.com/veilid/veilid.git", tag = "v0.5.1" } # v0.3.0 matches Veilid 0.5.1 API veilid-iroh-blobs = { git = "https://github.com/RangerMauve/veilid-iroh-blobs", tag = "v0.3.0" } tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } xdg = "2.4" tmpdir = "1" serde = "1.0.204" @@ -35,3 +38,4 @@ base64 = "0.22.1" # Temporary patch for Veilid fanout queue underflow bug [patch."https://gitlab.com/veilid/veilid.git"] veilid-core = { git = "https://gitlab.com/tripledoublev/veilid.git", branch = "fix-underflow" } +veilid-tools = { git = "https://gitlab.com/tripledoublev/veilid.git", branch = "fix-underflow" }