From 815354e4118c55d3a5257c58afc5c05a2936bfa8 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Thu, 23 Jan 2025 20:28:43 +0000 Subject: [PATCH 1/8] background task --- dev-tools/omdb/src/bin/omdb/nexus.rs | 69 ++ dev-tools/omdb/tests/env.out | 12 + dev-tools/omdb/tests/successes.out | 62 ++ dev-tools/omdb/tests/usage_errors.out | 22 +- nexus-config/src/nexus_config.rs | 16 + nexus/Cargo.toml | 4 + nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/sled.rs | 10 + nexus/db-queries/src/db/datastore/dataset.rs | 1 + nexus/db-queries/src/db/datastore/mod.rs | 5 + .../src/db/datastore/physical_disk.rs | 2 + nexus/db-queries/src/db/datastore/rack.rs | 2 + nexus/db-queries/src/db/datastore/sled.rs | 3 + .../src/db/datastore/support_bundle.rs | 1 + nexus/db-queries/src/db/datastore/update.rs | 23 +- nexus/db-queries/src/db/datastore/vpc.rs | 1 + nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/reconfigurator/execution/src/lib.rs | 1 + nexus/src/app/background/init.rs | 26 +- .../background/tasks/blueprint_execution.rs | 2 + .../background/tasks/inventory_collection.rs | 1 + nexus/src/app/background/tasks/mod.rs | 1 + .../tasks/tuf_artifact_replication.rs | 941 ++++++++++++++++++ nexus/src/app/mod.rs | 15 + nexus/src/app/sled.rs | 1 + nexus/src/app/update/mod.rs | 21 +- nexus/test-utils/src/background.rs | 103 ++ nexus/tests/config.test.toml | 4 + nexus/tests/integration_tests/rack.rs | 2 + nexus/tests/integration_tests/updates.rs | 127 ++- nexus/tests/tuf-replication/delete.txt | 8 + nexus/tests/tuf-replication/new_sled.txt | 30 + nexus/tests/tuf-replication/recopy.txt | 3 + .../tuf-replication/simple_replicate.txt | 90 ++ nexus/types/src/deployment/planning_input.rs | 8 + nexus/types/src/internal_api/background.rs | 115 +++ nexus/types/src/internal_api/params.rs | 3 + openapi/nexus-internal.json | 7 + schema/crdb/dbinit.sql | 13 +- schema/crdb/tuf-artifact-replication/up01.sql | 7 + schema/crdb/tuf-artifact-replication/up02.sql | 2 + sled-agent/src/artifact_store.rs | 56 +- sled-agent/src/nexus.rs | 4 + sled-agent/src/sim/artifact_store.rs | 71 +- sled-agent/src/sim/server.rs | 2 + sled-agent/src/sim/sled_agent.rs | 11 +- sled-agent/src/sled_agent.rs | 9 +- smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + tufaceous/manifests/fake.toml | 4 +- .../src/artifacts/extracted_artifacts.rs | 24 +- 53 files changed, 1816 insertions(+), 137 deletions(-) create mode 100644 nexus/src/app/background/tasks/tuf_artifact_replication.rs create mode 100644 nexus/tests/tuf-replication/delete.txt create mode 100644 nexus/tests/tuf-replication/new_sled.txt create mode 100644 nexus/tests/tuf-replication/recopy.txt create mode 100644 nexus/tests/tuf-replication/simple_replicate.txt create mode 100644 schema/crdb/tuf-artifact-replication/up01.sql create mode 100644 schema/crdb/tuf-artifact-replication/up02.sql diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d11f18a321d..0a01cecc912 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -51,6 +51,9 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use 
nexus_types::internal_api::background::TufArtifactReplicationCounters; +use nexus_types::internal_api::background::TufArtifactReplicationRequest; +use nexus_types::internal_api::background::TufArtifactReplicationStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::CollectionUuid; @@ -943,6 +946,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "service_firewall_rule_propagation" => { print_task_service_firewall_rule_propagation(details); } + "tuf_artifact_replication" => { + print_task_tuf_artifact_replication(details); + } _ => { println!( "warning: unknown background task: {:?} \ @@ -2024,6 +2030,69 @@ fn print_task_service_firewall_rule_propagation(details: &serde_json::Value) { }; } +fn print_task_tuf_artifact_replication(details: &serde_json::Value) { + fn print_counters(counters: TufArtifactReplicationCounters) { + const ROWS: &[&str] = &[ + "list ok:", + "list err:", + "put ok:", + "put err:", + "copy ok:", + "copy err:", + "delete ok:", + "delete err:", + ]; + const WIDTH: usize = const_max_len(ROWS); + + for (label, value) in ROWS.iter().zip([ + counters.list_ok, + counters.list_err, + counters.put_ok, + counters.put_err, + counters.copy_ok, + counters.copy_err, + counters.delete_ok, + counters.delete_err, + ]) { + println!(" {label:3}"); + } + } + + match serde_json::from_value::( + details.clone(), + ) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(status) => { + println!(" request ringbuf:"); + for TufArtifactReplicationRequest { + time, + target_sled, + operation, + error, + } in status.request_debug_ringbuf.iter() + { + println!(" - target sled: {target_sled}"); + println!(" operation: {operation:?}"); + println!( + " at: {}", + time.to_rfc3339_opts(SecondsFormat::Secs, true) + ); + if let Some(error) = error { + println!(" error: {error}") + } + } + println!(" last run:"); + print_counters(status.last_run_counters); + println!(" lifetime:"); + print_counters(status.lifetime_counters); + println!(" local repos: {}", status.local_repos); + } + } +} + /// Summarizes an `ActivationReason` fn reason_str(reason: &ActivationReason) -> &'static str { match reason { diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 4aecca2c738..b650544f78b 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -175,6 +175,10 @@ task: "switch_port_config_manager" manages switch port settings for rack switches +task: "tuf_artifact_replication" + replicate update repo artifacts across sleds + + task: "v2p_manager" manages opte v2p mappings for vpc networking @@ -355,6 +359,10 @@ task: "switch_port_config_manager" manages switch port settings for rack switches +task: "tuf_artifact_replication" + replicate update repo artifacts across sleds + + task: "v2p_manager" manages opte v2p mappings for vpc networking @@ -522,6 +530,10 @@ task: "switch_port_config_manager" manages switch port settings for rack switches +task: "tuf_artifact_replication" + replicate update repo artifacts across sleds + + task: "v2p_manager" manages opte v2p mappings for vpc networking diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 09a4bb70c50..1c0a676cf8e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -394,6 +394,10 @@ task: "switch_port_config_manager" manages switch port settings for rack switches +task: "tuf_artifact_replication" + 
replicate update repo artifacts across sleds + + task: "v2p_manager" manages opte v2p mappings for vpc networking @@ -724,6 +728,35 @@ task: "switch_port_config_manager" started at (s ago) and ran for ms warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {}) +task: "tuf_artifact_replication" + configured period: every h + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + request ringbuf: + - target sled: ..................... + operation: List + at: + last run: + list ok: 1 + list err: 0 + put ok: 0 + put err: 0 + copy ok: 0 + copy err: 0 + delete ok: 0 + delete err: 0 + lifetime: + list ok: 1 + list err: 0 + put ok: 0 + put err: 0 + copy ok: 0 + copy err: 0 + delete ok: 0 + delete err: 0 + local repos: 0 + task: "v2p_manager" configured period: every s currently executing: no @@ -1183,6 +1216,35 @@ task: "switch_port_config_manager" started at (s ago) and ran for ms warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {}) +task: "tuf_artifact_replication" + configured period: every h + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + request ringbuf: + - target sled: ..................... + operation: List + at: + last run: + list ok: 1 + list err: 0 + put ok: 0 + put err: 0 + copy ok: 0 + copy err: 0 + delete ok: 0 + delete err: 0 + lifetime: + list ok: 1 + list err: 0 + put ok: 0 + put err: 0 + copy ok: 0 + copy err: 0 + delete ok: 0 + delete err: 0 + local repos: 0 + task: "v2p_manager" configured period: every s currently executing: no diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 85fc7612893..f238d1dac18 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -315,17 +315,19 @@ Options: Show sleds that match the given filter Possible values: - - all: All sleds in the system, regardless of policy or state - - commissioned: All sleds that are currently part of the control plane cluster - - decommissioned: All sleds that were previously part of the control plane cluster - but have been decommissioned - - discretionary: Sleds that are eligible for discretionary services - - in-service: Sleds that are in service (even if they might not be eligible + - all: All sleds in the system, regardless of policy or state + - commissioned: All sleds that are currently part of the control plane cluster + - decommissioned: All sleds that were previously part of the control plane + cluster but have been decommissioned + - discretionary: Sleds that are eligible for discretionary services + - in-service: Sleds that are in service (even if they might not be eligible for discretionary services) - - query-during-inventory: Sleds whose sled agents should be queried for inventory - - reservation-create: Sleds on which reservations can be created - - vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing rules - - vpc-firewall: Sleds which should be sent VPC firewall rules + - query-during-inventory: Sleds whose sled agents should be queried for inventory + - reservation-create: Sleds on which reservations can be created + - vpc-routing: Sleds which should be sent OPTE V2P mappings and Routing rules + - vpc-firewall: Sleds which should be sent VPC firewall rules + - tuf-artifact-replication: Sleds which should have TUF repo artifacts 
replicated onto + them --log-level log level filter diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index c92121dce5b..dfaa847851f 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -417,6 +417,8 @@ pub struct BackgroundTaskConfig { /// configuration for region snapshot replacement finisher task pub region_snapshot_replacement_finish: RegionSnapshotReplacementFinishConfig, + /// configuration for TUF artifact replication task + pub tuf_artifact_replication: TufArtifactReplicationConfig, } #[serde_as] @@ -722,6 +724,14 @@ pub struct RegionSnapshotReplacementFinishConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct TufArtifactReplicationConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -978,6 +988,7 @@ mod test { region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 + tuf_artifact_replication.period_secs = 300 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1174,6 +1185,10 @@ mod test { RegionSnapshotReplacementFinishConfig { period_secs: Duration::from_secs(30), }, + tuf_artifact_replication: + TufArtifactReplicationConfig { + period_secs: Duration::from_secs(300) + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -1257,6 +1272,7 @@ mod test { region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 + tuf_artifact_replication.period_secs = 300 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index fbc86a42c26..9a02bd40c94 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -7,6 +7,10 @@ license = "MPL-2.0" [lints] workspace = true +[features] +# Set by omicron-package based on the target configuration. +rack-topology-single-sled = [] + [build-dependencies] omicron-rpaths.workspace = true diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index af9c09e4f21..7616315eaea 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -907,6 +907,7 @@ table! { sled_policy -> crate::sled_policy::SledPolicyEnum, sled_state -> crate::SledStateEnum, sled_agent_gen -> Int8, + repo_depot_port -> Int4, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index c07f358194c..5344322bb18 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(120, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(121, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(121, "tuf-artifact-replication"), KnownVersion::new(120, "rendezvous-debug-dataset"), KnownVersion::new(119, "tuf-artifact-key-uuid"), KnownVersion::new(118, "support-bundles"), diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index b586ad0fc58..ff0a92282ac 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -81,6 +81,9 @@ pub struct Sled { /// This is specifically distinct from `rcgen`, which is incremented by /// child resources as part of `DatastoreCollectionConfig`. pub sled_agent_gen: Generation, + + // ServiceAddress (Repo Depot API). Uses `ip`. + pub repo_depot_port: SqlU16, } impl Sled { @@ -169,6 +172,7 @@ impl From for params::SledAgentInfo { }; Self { sa_address: sled.address(), + repo_depot_port: sled.repo_depot_port.into(), role, baseboard: Baseboard { serial: sled.serial_number.clone(), @@ -220,6 +224,9 @@ pub struct SledUpdate { pub ip: ipv6::Ipv6Addr, pub port: SqlU16, + // ServiceAddress (Repo Depot API). Uses `ip`. + pub repo_depot_port: SqlU16, + // Generation number - owned and incremented by sled-agent. pub sled_agent_gen: Generation, } @@ -228,6 +235,7 @@ impl SledUpdate { pub fn new( id: Uuid, addr: SocketAddrV6, + repo_depot_port: u16, baseboard: SledBaseboard, hardware: SledSystemHardware, rack_id: Uuid, @@ -247,6 +255,7 @@ impl SledUpdate { reservoir_size: hardware.reservoir_size, ip: addr.ip().into(), port: addr.port().into(), + repo_depot_port: repo_depot_port.into(), sled_agent_gen, } } @@ -282,6 +291,7 @@ impl SledUpdate { reservoir_size: self.reservoir_size, ip: self.ip, port: self.port, + repo_depot_port: self.repo_depot_port, last_used_address, sled_agent_gen: self.sled_agent_gen, } diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index b61dec33047..d37e014b726 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -376,6 +376,7 @@ mod test { let sled = SledUpdate::new( *sled_id.as_untyped_uuid(), "[::1]:0".parse().unwrap(), + 0, SledBaseboard { serial_number: "test-sn".to_string(), part_number: "test-pn".to_string(), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index d14eb58ea6f..724489259df 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -473,6 +473,7 @@ mod test { use nexus_types::deployment::BlueprintTarget; use nexus_types::external_api::params; use nexus_types::silo::DEFAULT_SILO_ID; + use omicron_common::address::REPO_DEPOT_PORT; use omicron_common::api::external::{ ByteCount, Error, IdentityMetadataCreateParams, LookupType, Name, }; @@ -719,12 +720,14 @@ mod test { 0, 0, ); + let bogus_repo_depot_port = 8081; let rack_id = Uuid::new_v4(); let sled_id = SledUuid::new_v4(); let sled_update = SledUpdate::new( sled_id.into_untyped_uuid(), bogus_addr, + bogus_repo_depot_port, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, @@ -1727,6 +1730,7 @@ mod test { let sled1 = db::model::SledUpdate::new( sled1_id, addr1, + REPO_DEPOT_PORT, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, @@ -1739,6 +1743,7 @@ mod test { let sled2 = db::model::SledUpdate::new( sled2_id, addr2, + REPO_DEPOT_PORT, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs 
b/nexus/db-queries/src/db/datastore/physical_disk.rs index a2499070c47..1d1c6286a6d 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -344,10 +344,12 @@ mod test { async fn create_test_sled(db: &DataStore) -> Sled { let sled_id = Uuid::new_v4(); let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); + let repo_depot_port = 0; let rack_id = Uuid::new_v4(); let sled_update = SledUpdate::new( sled_id, addr, + repo_depot_port, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index c8f6deb2ed9..492d03b4c1e 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -1237,9 +1237,11 @@ mod test { async fn create_test_sled(db: &DataStore, sled_id: Uuid) -> Sled { let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); + let repo_depot_port = 0; let sled_update = SledUpdate::new( sled_id, addr, + repo_depot_port, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id(), diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index e4a9c82395a..87d5c324c63 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -72,6 +72,7 @@ impl DataStore { dsl::time_modified.eq(now), dsl::ip.eq(sled_update.ip), dsl::port.eq(sled_update.port), + dsl::repo_depot_port.eq(sled_update.repo_depot_port), dsl::rack_id.eq(sled_update.rack_id), dsl::is_scrimlet.eq(sled_update.is_scrimlet()), dsl::usable_hardware_threads @@ -1491,9 +1492,11 @@ pub(in crate::db::datastore) mod test { pub(crate) fn test_new_sled_update() -> SledUpdate { let sled_id = Uuid::new_v4(); let addr = SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0); + let repo_depot_port = 0; SledUpdate::new( sled_id, addr, + repo_depot_port, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id(), diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 3ca4cdae6df..1aae03a52f6 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -570,6 +570,7 @@ mod test { let sled = SledUpdate::new( *self.sled.as_untyped_uuid(), "[::1]:0".parse().unwrap(), + 0, SledBaseboard { serial_number: format!( "test-{}", diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index f1a62d53d2a..8abddc8ada0 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -12,18 +12,20 @@ use crate::context::OpContext; use crate::db; use crate::db::error::{public_error_from_diesel, ErrorHandler}; use crate::db::model::SemverVersion; +use crate::db::pagination::paginated; use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::Error as DieselError; use nexus_db_model::{ArtifactHash, TufArtifact, TufRepo, TufRepoDescription}; use omicron_common::api::external::{ - self, CreateResult, LookupResult, LookupType, ResourceType, - TufRepoInsertStatus, + self, CreateResult, DataPageParams, ListResultVec, LookupResult, + LookupType, ResourceType, TufRepoInsertStatus, }; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; use swrite::{swrite, SWrite}; +use uuid::Uuid; /// The return value of 
[`DataStore::update_tuf_repo_insert`]. /// @@ -140,6 +142,23 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; Ok(TufRepoDescription { repo, artifacts }) } + + /// Returns the list of all TUF repo artifacts known to the system. + pub async fn update_tuf_artifact_list( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + + use db::schema::tuf_artifact::dsl; + + paginated(dsl::tuf_artifact, dsl::id, pagparams) + .select(TufArtifact::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } // This is a separate method mostly to make rustfmt not bail out on long lines diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 13cc1754964..e70e49e4df8 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -3059,6 +3059,7 @@ mod tests { .sled_upsert(SledUpdate::new( sled_id.into_untyped_uuid(), "[::1]:0".parse().unwrap(), + 0, sled_baseboard_for_test(), sled_system_hardware_for_test(), rack_id, diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index e129975d841..a423ae25b22 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -141,6 +141,7 @@ region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 +tuf_artifact_replication.period_secs = 300 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 2ba7b570985..972ae2f0140 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -127,6 +127,7 @@ region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 +tuf_artifact_replication.period_secs = 300 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. 
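The `tuf_artifact_replication.period_secs = 300` knob added to both example configs above is read through the `TufArtifactReplicationConfig` struct introduced in `nexus-config` earlier in this patch, using the same `serde_with::DurationSeconds` pattern as the other background-task periods. A minimal standalone sketch of that round trip, illustrative only and not part of this change (it assumes the `serde`, `serde_with`, and `toml` crates; the struct here is a trimmed-down stand-in for the real one):

    // Standalone illustration only; not part of this patch.
    use std::time::Duration;

    use serde::Deserialize;
    use serde_with::{serde_as, DurationSeconds};

    #[serde_as]
    #[derive(Debug, Deserialize, Eq, PartialEq)]
    struct TufArtifactReplicationConfig {
        /// period (in seconds) for periodic activations of this background task
        #[serde_as(as = "DurationSeconds<u64>")]
        period_secs: Duration,
    }

    fn main() {
        // `period_secs = 300` in the TOML becomes a five-minute `Duration`,
        // which the task driver uses as its periodic activation interval.
        let config: TufArtifactReplicationConfig =
            toml::from_str("period_secs = 300").unwrap();
        assert_eq!(config.period_secs, Duration::from_secs(300));
    }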
diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 6d8af9dd53d..49f3a60b443 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -770,6 +770,7 @@ mod tests { let sled = SledUpdate::new( sled_id.into_untyped_uuid(), "[::1]:0".parse().unwrap(), + 0, SledBaseboard { serial_number: format!("test-{sled_id}"), part_number: "test-sled".to_string(), diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index d44b20cfcd0..cda5ccd997b 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -119,6 +119,7 @@ use super::tasks::service_firewall_rules; use super::tasks::support_bundle_collector; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; +use super::tasks::tuf_artifact_replication; use super::tasks::v2p_mappings::V2PManager; use super::tasks::vpc_routes; use super::Activator; @@ -135,7 +136,9 @@ use omicron_uuid_kinds::OmicronZoneUuid; use oximeter::types::ProducerRegistry; use std::collections::BTreeMap; use std::sync::Arc; +use tokio::sync::mpsc; use tokio::sync::watch; +use update_common::artifacts::ArtifactsWithPlan; use uuid::Uuid; /// Interface for activating various background tasks and read data that they @@ -176,6 +179,7 @@ pub struct BackgroundTasks { pub task_region_snapshot_replacement_garbage_collection: Activator, pub task_region_snapshot_replacement_step: Activator, pub task_region_snapshot_replacement_finish: Activator, + pub task_tuf_artifact_replication: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as @@ -265,6 +269,7 @@ impl BackgroundTasksInitializer { ), task_region_snapshot_replacement_step: Activator::new(), task_region_snapshot_replacement_finish: Activator::new(), + task_tuf_artifact_replication: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -333,6 +338,7 @@ impl BackgroundTasksInitializer { task_region_snapshot_replacement_garbage_collection, task_region_snapshot_replacement_step, task_region_snapshot_replacement_finish, + task_tuf_artifact_replication, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. 
@@ -868,13 +874,29 @@ impl BackgroundTasksInitializer { done", period: config.region_snapshot_replacement_finish.period_secs, task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( - datastore, sagas, + datastore.clone(), + sagas, )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_finish, }); + driver.register(TaskDefinition { + name: "tuf_artifact_replication", + description: "replicate update repo artifacts across sleds", + period: config.tuf_artifact_replication.period_secs, + task_impl: Box::new( + tuf_artifact_replication::ArtifactReplication::new( + datastore.clone(), + args.tuf_artifact_replication_rx, + ), + ), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_tuf_artifact_replication, + }); + driver } } @@ -899,6 +921,8 @@ pub struct BackgroundTasksData { pub producer_registry: ProducerRegistry, /// Helpers for saga recovery pub saga_recovery: saga_recovery::SagaRecoveryHelpers>, + /// Channel for TUF repository artifacts to be replicated out to sleds + pub tuf_artifact_replication_rx: mpsc::Receiver, } /// Starts the three DNS-propagation-related background tasks for either diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 98991a0af1e..33aff8b8a18 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -321,9 +321,11 @@ mod test { let SocketAddr::V6(addr) = server.addr() else { panic!("Expected Ipv6 address. Got {}", server.addr()); }; + let bogus_repo_depot_port = 0; let update = SledUpdate::new( sled_id.into_untyped_uuid(), addr, + bogus_repo_depot_port, SledBaseboard { serial_number: i.to_string(), part_number: "test".into(), diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs index 1d2aba2c368..54adfa617a9 100644 --- a/nexus/src/app/background/tasks/inventory_collection.rs +++ b/nexus/src/app/background/tasks/inventory_collection.rs @@ -400,6 +400,7 @@ mod test { let sled = SledUpdate::new( Uuid::new_v4(), SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1200 + i, 0, 0), + 1200 + i, SledBaseboard { serial_number: format!("serial-{}", i), part_number: String::from("fake-sled"), diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 2eaa66a3247..add3e47241a 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -36,5 +36,6 @@ pub mod service_firewall_rules; pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; +pub mod tuf_artifact_replication; pub mod v2p_mappings; pub mod vpc_routes; diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs new file mode 100644 index 00000000000..aae41bbdae9 --- /dev/null +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -0,0 +1,941 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! TUF Repo Depot: Artifact replication across sleds (RFD 424) +//! +//! `Nexus::updates_put_repository` accepts a TUF repository, which Nexus +//! unpacks, verifies, and reasons about the artifacts in. This uses temporary +//! storage within the Nexus zone. 
After that, the update artifacts have to
+//! go somewhere else. We've settled for now on "everywhere": a copy of each
+//! artifact is stored on each sled's M.2 devices.
+//!
+//! This background task is responsible for getting locally-stored artifacts
+//! onto sleds, and ensuring all sleds have copies of all artifacts.
+//! `Nexus::updates_put_repository` sends the [`ArtifactsWithPlan`] object to
+//! this task via an [`mpsc`] channel and activates it. Once enough sleds have
+//! a copy of each artifact in an `ArtifactsWithPlan`, the local copy is
+//! removed.
+//!
+//! # Task flow
+//!
+//! 1. The task moves `ArtifactsWithPlan` objects off the `mpsc` channel and
+//!    into a `Vec` that represents the set of artifacts stored locally in this
+//!    Nexus zone.
+//! 2. The task fetches the list of artifacts from CockroachDB and queries the
+//!    list of artifacts stored on each sled. Sled artifact storage is
+//!    content-addressed by SHA-256 checksum. Errors querying a sled are
+//!    logged but otherwise ignored: the task proceeds as if that sled has no
+//!    artifacts. (This means that the task will keep trying to replicate
+//!    artifacts to that sled until it comes back or is pulled out of service.)
+//! 3. The task builds a directory of all artifacts and where they can be found
+//!    (local `ArtifactsWithPlan` and/or sled agents).
+//! 4. If all the artifacts belonging to an `ArtifactsWithPlan` object have
+//!    been replicated to at least `MIN_SLED_REPLICATION` sleds, the task drops
+//!    the object from its `Vec` (thus cleaning up the local storage of those
+//!    files).
+//! 5. The task generates the list of requests that need to be sent:
+//!    - PUT each locally-stored artifact not present on any sleds to
+//!      `MIN_SLED_REPLICATION` random sleds.
+//!    - For each partially-replicated artifact, choose a sled that is missing
+//!      the artifact, and tell it (via `artifact_copy_from_depot`) to fetch
+//!      the artifact from a random sled that has it.
+//!    - DELETE all artifacts no longer tracked in CockroachDB from all sleds
+//!      that have that artifact.
+//!
+//! # Rate limits
+//!
+//! To avoid doing everything at once, Nexus limits the number of requests it
+//! has in flight at any one time. That concurrency limit is controlled by
+//! `MAX_REQUEST_CONCURRENCY` and is primarily intended to reduce bandwidth
+//! spikes for PUT requests.
+//!
+//! TODO: In addition to Nexus's concurrency limit, we should also rate limit
+//! the requests per second sent by Nexus, as well as limit the number of
+//! ongoing copy requests being processed at once by Sled Agent.
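The concurrency bound described above is what the task enforces further down in this file by driving its request stream with `buffer_unordered(MAX_REQUEST_CONCURRENCY)`. A minimal standalone sketch of that bounding technique, illustrative only and not part of this change (it assumes the `tokio` and `futures` crates):

    // Standalone illustration only; not part of this patch.
    use futures::{stream, StreamExt};

    const MAX_REQUEST_CONCURRENCY: usize = 8;

    #[tokio::main]
    async fn main() {
        // Stand-ins for the generated requests; in the real task each item is
        // a future that performs one HTTP call to a sled agent.
        let requests = 0u32..32;

        // `buffer_unordered` keeps at most MAX_REQUEST_CONCURRENCY futures in
        // flight at a time and yields results as they complete.
        let results: Vec<u32> = stream::iter(requests)
            .map(|n| async move { n * 2 })
            .buffer_unordered(MAX_REQUEST_CONCURRENCY)
            .collect()
            .await;

        assert_eq!(results.len(), 32);
    }

Results are yielded in completion order, so a slow request only occupies a single slot in the window rather than stalling the whole stream.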
+
+use std::collections::{BTreeMap, VecDeque};
+use std::future::Future;
+use std::str::FromStr;
+use std::sync::Arc;
+
+use anyhow::Result;
+use chrono::Utc;
+use futures::future::BoxFuture;
+use futures::{FutureExt, Stream, StreamExt};
+use nexus_auth::context::OpContext;
+use nexus_db_queries::db::{
+    datastore::SQL_BATCH_SIZE, pagination::Paginator, DataStore,
+};
+use nexus_networking::sled_client_from_address;
+use nexus_types::deployment::SledFilter;
+use nexus_types::internal_api::background::{
+    TufArtifactReplicationCounters, TufArtifactReplicationOperation,
+    TufArtifactReplicationRequest, TufArtifactReplicationStatus,
+};
+use omicron_common::update::ArtifactHash;
+use omicron_uuid_kinds::{GenericUuid, SledUuid};
+use rand::seq::SliceRandom;
+use serde_json::json;
+use slog_error_chain::InlineErrorChain;
+use tokio::sync::mpsc::error::TryRecvError;
+use tokio::sync::{mpsc, OwnedSemaphorePermit, Semaphore};
+use update_common::artifacts::{
+    ArtifactsWithPlan, ExtractedArtifactDataHandle,
+};
+
+use crate::app::background::BackgroundTask;
+
+// The maximum number of requests to sleds to run at once. This is intended
+// to reduce bandwidth spikes for PUT requests; other requests should return
+// quickly.
+const MAX_REQUEST_CONCURRENCY: usize = 8;
+// The number of sleds that artifacts must be present on before the local copy
+// of artifacts is dropped. This is ignored if there are fewer than this many
+// sleds in the system.
+#[cfg(not(feature = "rack-topology-single-sled"))]
+const MIN_SLED_REPLICATION: usize = 3;
+#[cfg(feature = "rack-topology-single-sled")]
+const MIN_SLED_REPLICATION: usize = 1;
+// The number of copies of an artifact we expect each sled to have. This is
+// currently 2 because sleds store a copy of each artifact on each M.2 device.
+const EXPECTED_COUNT: u32 = 2;
+// How many recent requests to remember for debugging purposes. At 32 sleds
+// and 64 artifacts per repo, this is enough to remember at least the most
+// recent two repositories being replicated or deleted unless requests need to
+// be retried.
+const MAX_REQUEST_DEBUG_BUFFER_LEN: usize = 32 * 64 * 2;
+
+#[derive(Debug)]
+struct Sled {
+    id: SledUuid,
+    client: sled_agent_client::Client,
+    depot_base_url: String,
+}
+
+#[derive(Debug)]
+struct ArtifactPresence {
+    /// The count of this artifact stored on each sled.
+    sleds: BTreeMap<SledUuid, u32>,
+    /// Handle to the artifact's local storage if present.
+    local: Option<ArtifactHandle>,
+    /// An artifact is wanted if it is listed in the `tuf_artifact` table.
+    wanted: bool,
+}
+
+/// Wrapper enum for `ExtractedArtifactDataHandle` so that we don't need to
+/// create and unpack TUF repos in unit tests. In `cfg(not(test))` this is
+/// equivalent to a wrapper struct.
+#[derive(Debug, Clone)] +enum ArtifactHandle { + Extracted(ExtractedArtifactDataHandle), + #[cfg(test)] + Fake, +} + +impl ArtifactHandle { + async fn file(&self) -> anyhow::Result { + match self { + ArtifactHandle::Extracted(handle) => handle.file().await, + #[cfg(test)] + ArtifactHandle::Fake => unimplemented!(), + } + } +} + +#[derive(Debug, Default)] +struct Inventory(BTreeMap); + +impl Inventory { + fn into_requests<'a>( + self, + sleds: &'a [Sled], + rng: &mut impl rand::Rng, + ) -> Requests<'a> { + let mut requests = Requests::default(); + for (hash, presence) in self.0 { + if presence.wanted { + let (sleds_present, mut sleds_not_present) = + sleds.iter().partition::, _>(|sled| { + presence + .sleds + .get(&sled.id) + .copied() + .unwrap_or_default() + > 0 + }); + sleds_not_present.shuffle(rng); + + // If we have a local copy, PUT the artifact to more sleds until we + // meet `MIN_SLED_REPLICATION`. + let mut sled_puts = Vec::new(); + if let Some(handle) = presence.local { + let count = MIN_SLED_REPLICATION + .saturating_sub(sleds_present.len()); + for _ in 0..count { + let Some(sled) = sleds_not_present.pop() else { + break; + }; + requests.put.push(Request::Put { + sled, + handle: handle.clone(), + hash, + }); + sled_puts.push(sled); + } + } + + // Tell each remaining sled missing the artifact to fetch it + // from a random sled that has it. + for target_sled in sleds_not_present { + if let Some(source_sled) = + sleds_present.choose(rng).copied() + { + requests.other.push(Request::CopyFromDepot { + target_sled, + source_sled, + hash, + }); + } else { + // There are no sleds that currently have the artifact, + // but we might have PUT requests going out. Choose one + // of the sleds we are PUTting this artifact onto and + // schedule it after all PUT requests are done. + if let Some(source_sled) = + sled_puts.choose(rng).copied() + { + requests.copy_after_put.push( + Request::CopyFromDepot { + target_sled, + source_sled, + hash, + }, + ); + } + } + } + + // If there are sleds with 0 < n < `EXPECTED_COUNT` copies, tell + // them to also fetch it from a random other sled. + for target_sled in sleds { + let Some(count) = presence.sleds.get(&target_sled.id) + else { + continue; + }; + if *count > 0 && *count < EXPECTED_COUNT { + let Ok(source_sled) = sleds_present + .choose_weighted(rng, |sled| { + if sled.id == target_sled.id { + 0 + } else { + 1 + } + }) + .copied() + else { + break; + }; + requests.recopy.push(Request::CopyFromDepot { + target_sled, + source_sled, + hash, + }) + } + } + } else { + // We don't want this artifact to be stored anymore, so tell all + // sleds that have it to DELETE it. + for sled in sleds { + if presence.sleds.contains_key(&sled.id) { + requests.other.push(Request::Delete { sled, hash }); + } + } + } + } + + requests.put.shuffle(rng); + requests.copy_after_put.shuffle(rng); + requests.recopy.shuffle(rng); + requests.other.shuffle(rng); + requests + } +} + +#[derive(Debug, Default)] +struct Requests<'a> { + /// PUT requests are always highest priority. + put: Vec>, + /// Copy requests are standard priority, but these copy requests depend on + /// PUTs and need to be delayed until all PUTs are complete. + copy_after_put: Vec>, + /// Requests to copy artifacts onto sleds that already have a copy, but not + /// `EXPECTED_COUNT` copies, are lowest priority. + recopy: Vec>, + /// Everything else; standard priority. 
+ other: Vec>, +} + +impl<'a> Requests<'a> { + fn into_stream( + self, + log: &'a slog::Logger, + ) -> impl Stream< + Item = impl Future + use<'a>, + > + use<'a> { + // Create a `Semaphore` with `self.put.len()` permits. Each PUT + // request is assigned a permit immediately, which we drop after + // `Request::execute` returns. Before yielding any requests in + // `copy_after_put` we acquire `self.put.len()` permits, thus waiting + // until all PUT requests have completed. + let put_len = self.put.len(); + let semaphore = Arc::new(Semaphore::new(put_len)); + let mut put_permits = Vec::new(); + for _ in 0..put_len { + // This cannot fail: the semaphore is not closed and there are + // `put_len` permits available. + let permit = Arc::clone(&semaphore) + .try_acquire_owned() + .expect("semaphore should have an available permit"); + put_permits.push(permit); + } + + let put = futures::stream::iter(self.put.into_iter().zip(put_permits)) + .map(|(request, permit)| request.execute(log, Some(permit))); + let other = + futures::stream::iter(self.other.into_iter().chain(self.recopy)) + .map(|request| request.execute(log, None)); + + let copy_after_put = async move { + // There's an awkward mix of usize and u32 in the `Semaphore` + // API. If `put_len` somehow exceeds u32, don't run any of the + // `copy_after_put` requests; we'll pick them up next time this + // background task runs. (Otherwise we would run billions(?!) of + // copy requests early.) + let iter = if let Ok(put_len) = u32::try_from(put_len) { + // Wait for all PUT requests to complete by acquiring `put_len` + // permits from `semaphore`. + let _permit = semaphore + .acquire_many(put_len) + .await + .expect("semaphore should not be closed"); + self.copy_after_put + } else { + Vec::new() + }; + futures::stream::iter(iter) + .map(|request| request.execute(log, None)) + } + .flatten_stream(); + + put.chain(futures::stream::select(copy_after_put, other)) + } +} + +#[derive(Debug)] +enum Request<'a> { + Put { + sled: &'a Sled, + handle: ArtifactHandle, + hash: ArtifactHash, + }, + CopyFromDepot { + target_sled: &'a Sled, + source_sled: &'a Sled, + hash: ArtifactHash, + }, + Delete { + sled: &'a Sled, + hash: ArtifactHash, + }, +} + +impl Request<'_> { + async fn execute( + self, + log: &slog::Logger, + _permit: Option, + ) -> TufArtifactReplicationRequest { + let err: Option> = async { + match &self { + Request::Put { sled, handle, hash } => { + sled.client + .artifact_put(&hash.to_string(), handle.file().await?) + .await?; + } + Request::CopyFromDepot { target_sled, source_sled, hash } => { + target_sled + .client + .artifact_copy_from_depot( + &hash.to_string(), + &sled_agent_client::types::ArtifactCopyFromDepotBody { + depot_base_url: source_sled.depot_base_url.clone(), + }, + ) + .await?; + } + Request::Delete { sled, hash } => { + sled.client.artifact_delete(&hash.to_string()).await?; + } + }; + Ok(()) + } + .await + .err(); + + let time = Utc::now(); + let (target_sled, hash) = match &self { + Request::Put { sled, hash, .. } + | Request::CopyFromDepot { target_sled: sled, hash, .. } + | Request::Delete { sled, hash } => (sled, hash), + }; + let msg = match (&self, err.is_some()) { + (Request::Put { .. }, true) => "Failed to put artifact", + (Request::Put { .. }, false) => "Successfully put artifact", + (Request::CopyFromDepot { .. }, true) => { + "Failed to request artifact copy from depot" + } + (Request::CopyFromDepot { .. }, false) => { + "Successfully requested artifact copy from depot" + } + (Request::Delete { .. 
}, true) => "Failed to delete artifact", + (Request::Delete { .. }, false) => "Successfully deleted artifact", + }; + if let Some(ref err) = err { + slog::warn!( + log, + "{msg}"; + "error" => InlineErrorChain::new(err.as_ref()), + "sled" => target_sled.client.baseurl(), + "sha256" => &hash.to_string(), + ); + } else { + slog::info!( + log, + "{msg}"; + "sled" => target_sled.client.baseurl(), + "sha256" => &hash.to_string(), + ); + } + + TufArtifactReplicationRequest { + time, + target_sled: target_sled.id, + operation: match self { + Request::Put { hash, .. } => { + TufArtifactReplicationOperation::Put { hash } + } + Request::CopyFromDepot { hash, source_sled, .. } => { + TufArtifactReplicationOperation::Copy { + hash, + source_sled: source_sled.id, + } + } + Request::Delete { hash, .. } => { + TufArtifactReplicationOperation::Delete { hash } + } + }, + error: err.map(|err| err.to_string()), + } + } +} + +pub struct ArtifactReplication { + datastore: Arc, + local: Vec, + local_rx: mpsc::Receiver, + /// List of recent requests for debugging. + request_debug_ringbuf: Arc>, + lifetime_counters: TufArtifactReplicationCounters, +} + +impl BackgroundTask for ArtifactReplication { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + // Move any received artifacts from `local_rx` to `local`. + loop { + match self.local_rx.try_recv() { + Ok(artifacts) => self.local.push(artifacts), + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => { + let err = "artifact replication receiver disconnected"; + error!(&opctx.log, "{err}"); + return json!({"error": err}); + } + } + } + + let sleds = match self.list_sleds(opctx).await { + Ok(sleds) => sleds, + Err(err) => return json!({"error": format!("{err:#}")}), + }; + let mut counters = TufArtifactReplicationCounters::default(); + let mut inventory = + match self.list_artifacts_from_database(opctx).await { + Ok(inventory) => inventory, + Err(err) => return json!({"error": format!("{err:#}")}), + }; + self.list_artifacts_on_sleds( + opctx, + &sleds, + &mut inventory, + &mut counters, + ) + .await; + self.list_and_clean_up_local_artifacts(&mut inventory); + + let requests = + inventory.into_requests(&sleds, &mut rand::thread_rng()); + let completed = requests + .into_stream(&opctx.log) + .buffer_unordered(MAX_REQUEST_CONCURRENCY) + .collect::>() + .await; + self.insert_debug_requests(completed, &mut counters); + + self.lifetime_counters += counters; + serde_json::to_value(TufArtifactReplicationStatus { + last_run_counters: counters, + lifetime_counters: self.lifetime_counters, + request_debug_ringbuf: self.request_debug_ringbuf.clone(), + local_repos: self.local.len(), + }) + .unwrap() + } + .boxed() + } +} + +impl ArtifactReplication { + pub fn new( + datastore: Arc, + local_rx: mpsc::Receiver, + ) -> ArtifactReplication { + ArtifactReplication { + datastore, + local: Vec::new(), + local_rx, + request_debug_ringbuf: Arc::new(VecDeque::new()), + lifetime_counters: TufArtifactReplicationCounters::default(), + } + } + + fn insert_debug_requests( + &mut self, + mut requests: Vec, + counters: &mut TufArtifactReplicationCounters, + ) { + for request in &requests { + counters.inc(request); + } + + // `Arc::make_mut` will either directly provide a mutable reference + // if there are no other references, or clone it if there are. At this + // point there should never be any other references; we only clone this + // Arc when serializing the ringbuf to a `serde_json::Value`. 
+ let ringbuf = Arc::make_mut(&mut self.request_debug_ringbuf); + let to_delete = (ringbuf.len() + requests.len()) + .saturating_sub(MAX_REQUEST_DEBUG_BUFFER_LEN); + ringbuf.drain(0..to_delete); + + requests.sort(); + ringbuf.extend(requests); + } + + async fn list_sleds(&self, opctx: &OpContext) -> Result> { + Ok(self + .datastore + .sled_list_all_batched(&opctx, SledFilter::TufArtifactReplication) + .await? + .into_iter() + .map(|sled| Sled { + id: SledUuid::from_untyped_uuid(sled.identity.id), + client: sled_client_from_address( + sled.identity.id, + sled.address(), + &opctx.log, + ), + depot_base_url: format!( + "http://{}", + sled.address_with_port(sled.repo_depot_port.into()) + ), + }) + .collect()) + } + + async fn list_artifacts_from_database( + &self, + opctx: &OpContext, + ) -> Result { + let mut inventory = Inventory::default(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + while let Some(p) = paginator.next() { + let batch = self + .datastore + .update_tuf_artifact_list(opctx, &p.current_pagparams()) + .await?; + paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); + for artifact in batch { + // Two artifacts can have the same sha256 but all the values in + // this insertion step are the default value. + inventory.0.insert( + artifact.sha256.0, + ArtifactPresence { + sleds: BTreeMap::new(), + local: None, + wanted: true, + }, + ); + } + } + Ok(inventory) + } + + /// Ask all sled agents to list the artifacts they have, and mark those + /// artifacts as present on those sleds. + async fn list_artifacts_on_sleds( + &mut self, + opctx: &OpContext, + sleds: &[Sled], + inventory: &mut Inventory, + counters: &mut TufArtifactReplicationCounters, + ) { + let responses = + futures::future::join_all(sleds.iter().map(|sled| async move { + let response = sled.client.artifact_list().await; + (sled, Utc::now(), response) + })) + .await; + let mut requests = Vec::new(); + for (sled, time, response) in responses { + let mut error = None; + match response { + Ok(response) => { + for (hash, count) in response.into_inner() { + let Ok(hash) = ArtifactHash::from_str(&hash) else { + error = Some(format!( + "sled reported bogus artifact hash {hash:?}" + )); + error!( + &opctx.log, + "sled reported bogus artifact hash"; + "sled" => sled.client.baseurl(), + "bogus_hash" => hash, + ); + continue; + }; + let entry = + inventory.0.entry(hash).or_insert_with(|| { + ArtifactPresence { + sleds: BTreeMap::new(), + local: None, + // If we're inserting, this artifact wasn't listed in the + // database. + wanted: false, + } + }); + entry.sleds.insert(sled.id, count); + } + } + Err(err) => { + warn!( + &opctx.log, + "Failed to get artifact list"; + "error" => InlineErrorChain::new(&err), + "sled" => sled.client.baseurl(), + ); + error = Some(err.to_string()); + } + }; + requests.push(TufArtifactReplicationRequest { + time, + target_sled: sled.id, + operation: TufArtifactReplicationOperation::List, + error, + }); + } + self.insert_debug_requests(requests, counters); + } + + /// Fill in the `local` field on the values of `inventory` with any local + /// artifacts, while removing the locally-stored `ArtifactsWithPlan` objects + /// once they reach the minimum requirement to be considered replicated. 
+ fn list_and_clean_up_local_artifacts(&mut self, inventory: &mut Inventory) { + self.local.retain(|plan| { + let mut keep_plan = false; + for hash_id in plan.by_id().values().flatten() { + if let Some(handle) = plan.get_by_hash(hash_id) { + if let Some(presence) = inventory.0.get_mut(&hash_id.hash) { + presence.local = + Some(ArtifactHandle::Extracted(handle)); + if presence.sleds.len() < MIN_SLED_REPLICATION { + keep_plan = true; + } + } + } + } + keep_plan + }) + } +} + +#[cfg(test)] +mod tests { + use std::fmt::Write; + + use expectorate::assert_contents; + use rand::{rngs::StdRng, Rng, SeedableRng}; + + use super::*; + + /// Create a list of `Sled`s suitable for testing + /// `Inventory::into_requests`. Neither the `client` or `depot_base_url` + /// fields are ever used. + fn fake_sleds(n: usize, rng: &mut impl rand::Rng) -> Vec { + (0..n) + .map(|_| Sled { + id: SledUuid::from_untyped_uuid( + uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), + ), + client: sled_agent_client::Client::new( + "http://invalid.test", + slog::Logger::root(slog::Discard, slog::o!()), + ), + depot_base_url: String::new(), + }) + .collect() + } + + fn requests_to_string(requests: &Requests<'_>) -> String { + let mut s = String::new(); + for vec in [ + &requests.put, + &requests.copy_after_put, + &requests.recopy, + &requests.other, + ] { + for request in vec { + match request { + Request::Put { sled, hash, .. } => { + writeln!(s, "- PUT {hash}\n to {}", sled.id) + } + Request::CopyFromDepot { + target_sled, + source_sled, + hash, + } => { + writeln!( + s, + "- COPY {hash}\n from {}\n to {}", + source_sled.id, target_sled.id, + ) + } + Request::Delete { sled, hash } => { + writeln!(s, "- DELETE {hash}\n from {}", sled.id) + } + } + .unwrap(); + } + } + s + } + + fn check_consistency(requests: &Requests<'_>) { + // Everything in `put` should be `Put`. + for request in &requests.put { + assert!( + matches!(request, Request::Put { .. }), + "request in `put` is not `Put`: {request:?}" + ); + } + // Everything in `copy_after_put` should be `CopyFromDepot`, and should + // refer to an artifact that was put onto that sled in `put`. + for request in &requests.copy_after_put { + if let Request::CopyFromDepot { + source_sled, hash: copy_hash, .. + } = request + { + assert!( + requests.put.iter().any(|request| match request { + Request::Put { sled, hash, .. } => { + sled.id == source_sled.id && *hash == *copy_hash + } + _ => false, + }), + "request in `copy_after_put` does not follow from \ + any request in `put`: {request:?}" + ); + } else { + panic!( + "request in `copy_after_put` is not `CopyFromDepot`: \ + {request:?}" + ); + } + } + // Everything in `recopy` should be `Copy`. + for request in &requests.recopy { + assert!( + matches!(request, Request::CopyFromDepot { .. }), + "request in `recopy` is not `CopyFromDepot`: {request:?}" + ); + } + // Everything in `other` should be `Copy` or `Delete`. + for request in &requests.other { + assert!( + matches!( + request, + Request::CopyFromDepot { .. } | Request::Delete { .. } + ), + "request in `other` is not `CopyFromDepot` or `Delete`: \ + {request:?}" + ); + } + } + + #[test] + fn simple_replicate() { + // Replicate 2 local artifacts onto some number of sleds. 
+ let mut rng = StdRng::from_seed(Default::default()); + let sleds = fake_sleds(MIN_SLED_REPLICATION + 13, &mut rng); + let mut inventory = BTreeMap::new(); + for _ in 0..2 { + inventory.insert( + ArtifactHash(rng.gen()), + ArtifactPresence { + sleds: BTreeMap::new(), + local: Some(ArtifactHandle::Fake), + wanted: true, + }, + ); + } + let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + check_consistency(&requests); + assert_eq!(requests.put.len(), MIN_SLED_REPLICATION * 2); + assert_eq!(requests.copy_after_put.len(), 13 * 2); + assert_eq!(requests.recopy.len(), 0); + assert_eq!(requests.other.len(), 0); + assert_contents( + "tests/tuf-replication/simple_replicate.txt", + &requests_to_string(&requests), + ); + } + + #[test] + fn new_sled() { + // Ten artifacts are replicated across 4 sleds, and 1 sled has no + // artifacts. + let mut rng = StdRng::from_seed(Default::default()); + let sleds = fake_sleds(5, &mut rng); + let mut sled_presence = BTreeMap::new(); + for sled in &sleds[..4] { + sled_presence.insert(sled.id, 2); + } + let mut inventory = BTreeMap::new(); + for _ in 0..10 { + inventory.insert( + ArtifactHash(rng.gen()), + ArtifactPresence { + sleds: sled_presence.clone(), + local: None, + wanted: true, + }, + ); + } + let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + check_consistency(&requests); + assert_eq!(requests.put.len(), 0); + assert_eq!(requests.copy_after_put.len(), 0); + assert_eq!(requests.recopy.len(), 0); + assert_eq!(requests.other.len(), 10); + assert_contents( + "tests/tuf-replication/new_sled.txt", + &requests_to_string(&requests), + ); + } + + #[test] + fn delete() { + // 4 sleds have an artifact we don't want anymore. + let mut rng = StdRng::from_seed(Default::default()); + let sleds = fake_sleds(4, &mut rng); + let mut inventory = BTreeMap::new(); + inventory.insert( + ArtifactHash(rng.gen()), + ArtifactPresence { + sleds: sleds.iter().map(|sled| (sled.id, 2)).collect(), + local: None, + wanted: false, + }, + ); + let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + check_consistency(&requests); + assert_eq!(requests.put.len(), 0); + assert_eq!(requests.copy_after_put.len(), 0); + assert_eq!(requests.recopy.len(), 0); + assert_eq!(requests.other.len(), 4); + assert!( + requests + .other + .iter() + .all(|request| matches!(request, Request::Delete { .. })), + "not all requests are deletes" + ); + assert_contents( + "tests/tuf-replication/delete.txt", + &requests_to_string(&requests), + ); + } + + #[test] + fn recopy() { + // 3 sleds have two copies of an artifact; 1 has a single copy. + let mut rng = StdRng::from_seed(Default::default()); + let sleds = fake_sleds(4, &mut rng); + let mut inventory = BTreeMap::new(); + inventory.insert( + ArtifactHash(rng.gen()), + ArtifactPresence { + sleds: sleds + .iter() + .enumerate() + .map(|(i, sled)| (sled.id, if i == 0 { 1 } else { 2 })) + .collect(), + local: None, + wanted: true, + }, + ); + let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + check_consistency(&requests); + assert_eq!(requests.put.len(), 0); + assert_eq!(requests.copy_after_put.len(), 0); + assert_eq!(requests.recopy.len(), 1); + assert_eq!(requests.other.len(), 0); + assert_contents( + "tests/tuf-replication/recopy.txt", + &requests_to_string(&requests), + ); + } + + #[test] + fn nothing() { + // 4 sleds each have two copies of an artifact; there's nothing to do. 
+ let mut rng = StdRng::from_seed(Default::default()); + let sleds = fake_sleds(4, &mut rng); + let mut inventory = BTreeMap::new(); + inventory.insert( + ArtifactHash(rng.gen()), + ArtifactPresence { + sleds: sleds.iter().map(|sled| (sled.id, 2)).collect(), + local: None, + wanted: true, + }, + ); + let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + check_consistency(&requests); + assert_eq!(requests.put.len(), 0); + assert_eq!(requests.copy_after_put.len(), 0); + assert_eq!(requests.recopy.len(), 0); + assert_eq!(requests.other.len(), 0); + } +} diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 636a47f14a8..27511638a35 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -40,6 +40,7 @@ use std::net::{IpAddr, Ipv6Addr}; use std::sync::Arc; use std::sync::OnceLock; use tokio::sync::mpsc; +use update_common::artifacts::ArtifactsWithPlan; use uuid::Uuid; // The implementation of Nexus is large, and split into a number of submodules @@ -215,6 +216,10 @@ pub struct Nexus { /// List of demo sagas awaiting a request to complete them demo_sagas: Arc>, + + /// Sender for TUF repository artifacts temporarily stored in this zone to + /// be replicated out to sleds in the background + tuf_artifact_replication_tx: mpsc::Sender, } impl Nexus { @@ -296,6 +301,14 @@ impl Nexus { saga_create_tx, )); + // Create a channel for replicating repository artifacts. 16 is a + // dubious bound for the channel but it seems unlikely that an operator + // would want to upload more than one at a time, and at most have two + // or three on the system during an upgrade (we've sized the artifact + // datasets to fit at most 10 repositories for this reason). + let (tuf_artifact_replication_tx, tuf_artifact_replication_rx) = + mpsc::channel(16); + let client_state = dpd_client::ClientState { tag: String::from("nexus"), log: log.new(o!( @@ -502,6 +515,7 @@ impl Nexus { demo_sagas: Arc::new(std::sync::Mutex::new( CompletingDemoSagas::new(), )), + tuf_artifact_replication_tx, }; // TODO-cleanup all the extra Arcs here seems wrong @@ -556,6 +570,7 @@ impl Nexus { registry: sagas::ACTION_REGISTRY.clone(), sagas_started_rx: saga_recovery_rx, }, + tuf_artifact_replication_rx, }, ); diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index a8383b97762..5cd3cb9fa61 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -64,6 +64,7 @@ impl super::Nexus { let sled = db::model::SledUpdate::new( id, info.sa_address, + info.repo_depot_port, db::model::SledBaseboard { serial_number: info.baseboard.serial, part_number: info.baseboard.part, diff --git a/nexus/src/app/update/mod.rs b/nexus/src/app/update/mod.rs index d4a47375bc1..57c469e5f25 100644 --- a/nexus/src/app/update/mod.rs +++ b/nexus/src/app/update/mod.rs @@ -11,7 +11,7 @@ use nexus_db_model::TufRepoDescription; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use omicron_common::api::external::{ - Error, SemverVersion, TufRepoInsertResponse, + Error, SemverVersion, TufRepoInsertResponse, TufRepoInsertStatus, }; use omicron_common::update::ArtifactId; use update_common::artifacts::ArtifactsWithPlan; @@ -56,17 +56,32 @@ impl super::Nexus { ArtifactsWithPlan::from_stream(body, Some(file_name), &self.log) .await .map_err(|error| error.to_http_error())?; - // Now store the artifacts in the database. 
let tuf_repo_description = TufRepoDescription::from_external( artifacts_with_plan.description().clone(), ); - let response = self .db_datastore .update_tuf_repo_insert(opctx, tuf_repo_description) .await .map_err(HttpError::from)?; + + // If we inserted a new repository, move the `ArtifactsWithPlan` (which + // carries with it the `Utf8TempDir`s storing the artifacts) into the + // artifact replication background task, then immediately activate the + // task. + if response.status == TufRepoInsertStatus::Inserted { + self.tuf_artifact_replication_tx + .send(artifacts_with_plan) + .await + .map_err(|err| { + Error::internal_error(&format!( + "failed to send artifacts for replication: {err}" + )) + })?; + self.background_tasks.task_tuf_artifact_replication.activate(); + } + Ok(response.into_external()) } diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 7c8857123c1..84a83631feb 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -32,6 +32,44 @@ fn most_recent_activate_time( } } +/// Given the name of a background task, wait for it to complete if it's +/// running, then return the last polled `BackgroundTask` object. Panics if the +/// task has never been activated. +pub async fn wait_background_task( + internal_client: &ClientTestContext, + task_name: &str, +) -> BackgroundTask { + // Wait for the task to finish + let last_task_poll = wait_for_condition( + || async { + let task = NexusRequest::object_get( + internal_client, + &format!("/bgtasks/view/{task_name}"), + ) + .execute_and_parse_unwrap::() + .await; + + // Wait until the task has actually run and then is idle + if matches!(&task.current, CurrentStatus::Idle) { + match &task.last { + LastResult::Completed(_) => Ok(task), + LastResult::NeverCompleted => { + panic!("task never activated") + } + } + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(500), + &Duration::from_secs(60), + ) + .await + .unwrap(); + + last_task_poll +} + /// Given the name of a background task, activate it, then wait for it to /// complete. Return the last polled `BackgroundTask` object. 
pub async fn activate_background_task( @@ -337,3 +375,68 @@ pub async fn run_replacement_tasks_to_completion( .await .unwrap(); } + +pub async fn wait_tuf_artifact_replication_step( + internal_client: &ClientTestContext, +) -> TufArtifactReplicationStatus { + let last_background_task = + wait_background_task(&internal_client, "tuf_artifact_replication") + .await; + + let LastResult::Completed(last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from tuf_artifact_replication task", + last_background_task.last, + ); + }; + + let status = serde_json::from_value::( + last_result_completed.details, + ) + .unwrap(); + assert_eq!(status.last_run_counters.err(), 0); + status +} + +pub async fn run_tuf_artifact_replication_step( + internal_client: &ClientTestContext, +) -> TufArtifactReplicationStatus { + let last_background_task = + activate_background_task(&internal_client, "tuf_artifact_replication") + .await; + + let LastResult::Completed(last_result_completed) = + last_background_task.last + else { + panic!( + "unexpected {:?} returned from tuf_artifact_replication task", + last_background_task.last, + ); + }; + + let status = serde_json::from_value::( + last_result_completed.details, + ) + .unwrap(); + assert_eq!(status.last_run_counters.err(), 0); + status +} + +/// Run the `tuf_artifact_replication` task until the status indicates the task +/// has stabilized: no outstanding requests, and no local repositories. Panics +/// if the status does not change between runs. +pub async fn run_tuf_artifact_replication_to_completion( + internal_client: &ClientTestContext, +) { + let mut status = run_tuf_artifact_replication_step(internal_client).await; + while status.local_repos > 0 { + let new_status = + run_tuf_artifact_replication_step(internal_client).await; + if new_status == status { + panic!("TUF artifact replication stalled: {new_status:?}"); + } + status = new_status; + } +} diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 3c220119339..4338afaa360 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -155,6 +155,10 @@ region_snapshot_replacement_start.period_secs = 60 region_snapshot_replacement_garbage_collection.period_secs = 60 region_snapshot_replacement_step.period_secs = 60 region_snapshot_replacement_finish.period_secs = 60 +# The default activation period for this task is 60s, but we want to activate it +# manually to test the result of each activation, so set the activation period +# to something we'll never see in a test run. 
+tuf_artifact_replication.period_secs = 3600 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/nexus/tests/integration_tests/rack.rs b/nexus/tests/integration_tests/rack.rs index 2cc7dc8977c..f4b67e0afd8 100644 --- a/nexus/tests/integration_tests/rack.rs +++ b/nexus/tests/integration_tests/rack.rs @@ -118,6 +118,7 @@ async fn test_sled_list_uninitialized(cptestctx: &ControlPlaneTestContext) { let sled_uuid = Uuid::new_v4(); let sa = SledAgentInfo { sa_address: "[fd00:1122:3344:0100::1]:8080".parse().unwrap(), + repo_depot_port: 8081, role: SledRole::Gimlet, baseboard, usable_hardware_threads: 32, @@ -211,6 +212,7 @@ async fn test_sled_add(cptestctx: &ControlPlaneTestContext) { .sled_upsert(SledUpdate::new( sled_id.into_untyped_uuid(), "[::1]:0".parse().unwrap(), + 0, SledBaseboard { serial_number: baseboard.serial.clone(), part_number: baseboard.part.clone(), diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 3a2c587cd4b..51d93f49caf 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -9,11 +9,13 @@ use anyhow::{ensure, Context, Result}; use camino::Utf8Path; -use camino_tempfile::{Builder, Utf8TempDir, Utf8TempPath}; +use camino_tempfile::{Builder, Utf8TempPath}; use clap::Parser; use dropshot::test_util::LogContext; use http::{Method, StatusCode}; use nexus_config::UpdatesConfig; +use nexus_test_utils::background::run_tuf_artifact_replication_step; +use nexus_test_utils::background::wait_tuf_artifact_replication_step; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::{load_test_config, test_setup, test_setup_with_config}; use omicron_common::api::external::{ @@ -24,32 +26,15 @@ use omicron_common::api::internal::nexus::KnownArtifactKind; use omicron_sled_agent::sim; use pretty_assertions::assert_eq; use serde::Deserialize; +use std::collections::HashSet; use std::fmt::Debug; -use std::fs::File; use std::io::Write; use tufaceous_lib::assemble::{DeserializedManifest, ManifestTweak}; -const FAKE_MANIFEST_PATH: &'static str = "../tufaceous/manifests/fake.toml"; - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_update_uninitialized() -> Result<()> { +async fn test_repo_upload_unconfigured() -> Result<()> { let mut config = load_test_config(); let logctx = LogContext::new("test_update_uninitialized", &config.pkg.log); - - // Build a fake TUF repo - let temp_dir = Utf8TempDir::new()?; - let archive_path = temp_dir.path().join("archive.zip"); - - let args = tufaceous::Args::try_parse_from([ - "tufaceous", - "assemble", - FAKE_MANIFEST_PATH, - archive_path.as_str(), - ]) - .context("error parsing args")?; - - args.exec(&logctx.log).await.context("error executing assemble command")?; - let cptestctx = test_setup_with_config::( "test_update_uninitialized", &mut config, @@ -60,6 +45,9 @@ async fn test_update_uninitialized() -> Result<()> { .await; let client = &cptestctx.external_client; + // Build a fake TUF repo + let archive_path = make_archive(&logctx.log).await?; + // Attempt to upload the repository to Nexus. This should fail with a 500 // error because the updates system is not configured. { @@ -73,6 +61,15 @@ async fn test_update_uninitialized() -> Result<()> { .context("repository upload should have failed with 500 error")?; } + // The artifact replication background task should have nothing to do. 
+ let status = + run_tuf_artifact_replication_step(&cptestctx.internal_client).await; + assert_eq!( + status.last_run_counters.sum() - status.last_run_counters.list_ok, + 0 + ); + assert_eq!(status.local_repos, 0); + // Attempt to fetch a repository description from Nexus. This should also // fail with a 500 error. { @@ -93,7 +90,7 @@ async fn test_update_uninitialized() -> Result<()> { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_update_end_to_end() -> Result<()> { +async fn test_repo_upload() -> Result<()> { let mut config = load_test_config(); config.pkg.updates = Some(UpdatesConfig { // XXX: This is currently not used by the update system, but @@ -101,31 +98,19 @@ async fn test_update_end_to_end() -> Result<()> { trusted_root: "does-not-exist.json".into(), }); let logctx = LogContext::new("test_update_end_to_end", &config.pkg.log); - - // Build a fake TUF repo - let temp_dir = Utf8TempDir::new()?; - let archive_path = temp_dir.path().join("archive.zip"); - - let args = tufaceous::Args::try_parse_from([ - "tufaceous", - "assemble", - FAKE_MANIFEST_PATH, - archive_path.as_str(), - ]) - .context("error parsing args")?; - - args.exec(&logctx.log).await.context("error executing assemble command")?; - let cptestctx = test_setup_with_config::( "test_update_end_to_end", &mut config, sim::SimMode::Explicit, None, - 0, + 3, // 4 total sled agents ) .await; let client = &cptestctx.external_client; + // Build a fake TUF repo + let archive_path = make_archive(&logctx.log).await?; + // Upload the repository to Nexus. let mut initial_description = { let response = @@ -140,6 +125,33 @@ async fn test_update_end_to_end() -> Result<()> { assert_eq!(response.status, TufRepoInsertStatus::Inserted); response.recorded }; + let unique_sha256_count = initial_description + .artifacts + .iter() + .map(|artifact| artifact.hash) + .collect::>() + .len(); + + // The artifact replication background task should have been activated, and + // we should see a local repo and successful PUTs. + let status = + wait_tuf_artifact_replication_step(&cptestctx.internal_client).await; + eprintln!("{status:?}"); + assert_eq!(status.last_run_counters.list_ok, 4); + assert_eq!(status.last_run_counters.put_ok, 3 * unique_sha256_count); + assert_eq!(status.last_run_counters.copy_ok, unique_sha256_count); + assert_eq!(status.last_run_counters.delete_ok, 0); + // The local repo is not deleted until the next task run. + assert_eq!(status.local_repos, 1); + + // Run the replication background task again; the local repos should be + // dropped. + let status = + run_tuf_artifact_replication_step(&cptestctx.internal_client).await; + eprintln!("{status:?}"); + assert_eq!(status.last_run_counters.list_ok, 4); + assert_eq!(status.last_run_counters.sum(), 4); + assert_eq!(status.local_repos, 0); // Upload the repository to Nexus again. This should return a 200 with an // `AlreadyExists` status. @@ -189,8 +201,6 @@ async fn test_update_end_to_end() -> Result<()> { "initial description matches fetched description" ); - // TODO: attempt to download extracted artifacts. - // Upload a new repository with the same system version but a different // version for one of the components. This will produce a different hash, // which should return an error. 
@@ -199,8 +209,7 @@ async fn test_update_end_to_end() -> Result<()> { kind: KnownArtifactKind::GimletSp, version: "2.0.0".parse().unwrap(), }]; - let archive_path = - make_tweaked_archive(&logctx.log, &temp_dir, tweaks).await?; + let archive_path = make_tweaked_archive(&logctx.log, tweaks).await?; let response = make_upload_request( client, @@ -229,8 +238,7 @@ async fn test_update_end_to_end() -> Result<()> { size_delta: 1024, }, ]; - let archive_path = - make_tweaked_archive(&logctx.log, &temp_dir, tweaks).await?; + let archive_path = make_tweaked_archive(&logctx.log, tweaks).await?; let response = make_upload_request(client, &archive_path, StatusCode::CONFLICT) @@ -250,8 +258,7 @@ async fn test_update_end_to_end() -> Result<()> { // changes. This should be accepted. { let tweaks = &[ManifestTweak::SystemVersion("2.0.0".parse().unwrap())]; - let archive_path = - make_tweaked_archive(&logctx.log, &temp_dir, tweaks).await?; + let archive_path = make_tweaked_archive(&logctx.log, tweaks).await?; let response = make_upload_request(client, &archive_path, StatusCode::OK) @@ -265,35 +272,51 @@ async fn test_update_end_to_end() -> Result<()> { assert_eq!(response.status, TufRepoInsertStatus::Inserted); } + // No artifacts changed, so the task should have nothing to do and should + // delete the local artifacts. + let status = + wait_tuf_artifact_replication_step(&cptestctx.internal_client).await; + eprintln!("{status:?}"); + assert_eq!(status.last_run_counters.list_ok, 4); + assert_eq!(status.last_run_counters.put_ok, 0); + assert_eq!(status.last_run_counters.copy_ok, 0); + assert_eq!(status.last_run_counters.delete_ok, 0); + assert_eq!(status.local_repos, 0); + cptestctx.teardown().await; logctx.cleanup_successful(); Ok(()) } +async fn make_archive(log: &slog::Logger) -> anyhow::Result { + make_tweaked_archive(log, &[]).await +} + async fn make_tweaked_archive( log: &slog::Logger, - temp_dir: &Utf8TempDir, tweaks: &[ManifestTweak], ) -> anyhow::Result { let manifest = DeserializedManifest::tweaked_fake(tweaks); - let manifest_path = temp_dir.path().join("fake2.toml"); - let mut manifest_file = - File::create(&manifest_path).context("error creating manifest file")?; + let mut manifest_file = Builder::new() + .prefix("manifest") + .suffix(".toml") + .tempfile() + .context("error creating temp file for manifest")?; let manifest_to_toml = manifest.to_toml()?; manifest_file.write_all(manifest_to_toml.as_bytes())?; let archive_path = Builder::new() .prefix("archive") .suffix(".zip") - .tempfile_in(temp_dir.path()) - .context("error creating temp file for tweaked archive")? + .tempfile() + .context("error creating temp file for archive")? 
.into_temp_path(); let args = tufaceous::Args::try_parse_from([ "tufaceous", "assemble", - manifest_path.as_str(), + manifest_file.path().as_str(), archive_path.as_str(), ]) .context("error parsing args")?; diff --git a/nexus/tests/tuf-replication/delete.txt b/nexus/tests/tuf-replication/delete.txt new file mode 100644 index 00000000000..375179de219 --- /dev/null +++ b/nexus/tests/tuf-replication/delete.txt @@ -0,0 +1,8 @@ +- DELETE f44b581f23222c10916b17a369b4da039d075952b58036f2a7b561446592403c + from 0b20868c-c619-454c-82a1-c61be9902717 +- DELETE f44b581f23222c10916b17a369b4da039d075952b58036f2a7b561446592403c + from b24b8ec1-68fe-4896-b5b7-0ccb10f1a705 +- DELETE f44b581f23222c10916b17a369b4da039d075952b58036f2a7b561446592403c + from e061d0fd-fa18-4618-928b-0d44efef92cb +- DELETE f44b581f23222c10916b17a369b4da039d075952b58036f2a7b561446592403c + from 9b07815f-0449-4e2e-85d2-2cac3aa06141 diff --git a/nexus/tests/tuf-replication/new_sled.txt b/nexus/tests/tuf-replication/new_sled.txt new file mode 100644 index 00000000000..925322104a6 --- /dev/null +++ b/nexus/tests/tuf-replication/new_sled.txt @@ -0,0 +1,30 @@ +- COPY 6b0a41403276437b25f0ffbe897cf476c0777e9d246ff3974ac5f1b350aa16c4 + from b24b8ec1-68fe-4896-b5b7-0ccb10f1a705 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 91e0e00df6c012cded9ef9ba5e5bf042464b5ca6488b897c005b2e17f00fc0bc + from 0b20868c-c619-454c-82a1-c61be9902717 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY f8deccf3c3c518f146de9554f03f27a83d5b1185f39e71ef357e2bd9e1396251 + from 9b07815f-0449-4e2e-85d2-2cac3aa06141 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 83fa4146c34e4d5ca8f7f7dfaa083b1c11af4b844ff94b3fbef6e36b518da3ad + from 9b07815f-0449-4e2e-85d2-2cac3aa06141 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 4c5856fa686464524a876b463d1297603568c40e814d9d5396d23087a0fd641e + from 0b20868c-c619-454c-82a1-c61be9902717 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 20b28cdd32138ae34d22e6397af3a9a71e4efa1fbad16cb0cfafde5b2884858e + from e061d0fd-fa18-4618-928b-0d44efef92cb + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 2ea0d7117b30c8c8436ce7a9254b30d253e4567ccafa5f36ce84c80aa8bc9be6 + from 0b20868c-c619-454c-82a1-c61be9902717 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 358c9f856135236c3e75a925e1c77ac34127c8baefbea0939f1524712b4d37a0 + from 0b20868c-c619-454c-82a1-c61be9902717 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 4e0d5ae796884274aef3005ae6733809c8ec1d5b84dd6289e193b9f88de4a994 + from e061d0fd-fa18-4618-928b-0d44efef92cb + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 9d075952b58036f2a7b561446592403c672f0dbe129f1e39113f9f6164ea2867 + from e061d0fd-fa18-4618-928b-0d44efef92cb + to f44b581f-2322-4c10-916b-17a369b4da03 diff --git a/nexus/tests/tuf-replication/recopy.txt b/nexus/tests/tuf-replication/recopy.txt new file mode 100644 index 00000000000..7212f5e9389 --- /dev/null +++ b/nexus/tests/tuf-replication/recopy.txt @@ -0,0 +1,3 @@ +- COPY f44b581f23222c10916b17a369b4da039d075952b58036f2a7b561446592403c + from 0b20868c-c619-454c-82a1-c61be9902717 + to 9b07815f-0449-4e2e-85d2-2cac3aa06141 diff --git a/nexus/tests/tuf-replication/simple_replicate.txt b/nexus/tests/tuf-replication/simple_replicate.txt new file mode 100644 index 00000000000..7f409ddf87a --- /dev/null +++ b/nexus/tests/tuf-replication/simple_replicate.txt @@ -0,0 +1,90 @@ +- PUT 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + to c8ec1d5b-84dd-4289-a193-b9f88de4a994 +- PUT 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + to 
2ea0d711-7b30-48c8-836c-e7a9254b30d2 +- PUT 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + to 0b20868c-c619-454c-82a1-c61be9902717 +- PUT 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + to 53e4567c-cafa-4f36-8e84-c80aa8bc9be6 +- PUT 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + to f44b581f-2322-4c10-916b-17a369b4da03 +- PUT 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + to 0b20868c-c619-454c-82a1-c61be9902717 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 0b20868c-c619-454c-82a1-c61be9902717 + to 53e4567c-cafa-4f36-8e84-c80aa8bc9be6 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from f44b581f-2322-4c10-916b-17a369b4da03 + to b24b8ec1-68fe-4896-b5b7-0ccb10f1a705 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 0b20868c-c619-454c-82a1-c61be9902717 + to f8deccf3-c3c5-48f1-86de-9554f03f27a8 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from f44b581f-2322-4c10-916b-17a369b4da03 + to e061d0fd-fa18-4618-928b-0d44efef92cb +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from f44b581f-2322-4c10-916b-17a369b4da03 + to 6b0a4140-3276-437b-a5f0-ffbe897cf476 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 53e4567c-cafa-4f36-8e84-c80aa8bc9be6 + to 4e0d5ae7-9688-4274-aef3-005ae6733809 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 0b20868c-c619-454c-82a1-c61be9902717 + to 6b0a4140-3276-437b-a5f0-ffbe897cf476 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 0b20868c-c619-454c-82a1-c61be9902717 + to f44b581f-2322-4c10-916b-17a369b4da03 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from c8ec1d5b-84dd-4289-a193-b9f88de4a994 + to 3d5b1185-f39e-41ef-b57e-2bd9e1396251 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 2ea0d711-7b30-48c8-836c-e7a9254b30d2 + to 4e0d5ae7-9688-4274-aef3-005ae6733809 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 53e4567c-cafa-4f36-8e84-c80aa8bc9be6 + to 9b07815f-0449-4e2e-85d2-2cac3aa06141 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 53e4567c-cafa-4f36-8e84-c80aa8bc9be6 + to c0777e9d-246f-4397-8ac5-f1b350aa16c4 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 0b20868c-c619-454c-82a1-c61be9902717 + to b24b8ec1-68fe-4896-b5b7-0ccb10f1a705 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 2ea0d711-7b30-48c8-836c-e7a9254b30d2 + to 358c9f85-6135-436c-be75-a925e1c77ac3 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 0b20868c-c619-454c-82a1-c61be9902717 + to 9b07815f-0449-4e2e-85d2-2cac3aa06141 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from c8ec1d5b-84dd-4289-a193-b9f88de4a994 + to 672f0dbe-129f-4e39-913f-9f6164ea2867 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 2ea0d711-7b30-48c8-836c-e7a9254b30d2 + to c0777e9d-246f-4397-8ac5-f1b350aa16c4 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 0b20868c-c619-454c-82a1-c61be9902717 + to f8deccf3-c3c5-48f1-86de-9554f03f27a8 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 0b20868c-c619-454c-82a1-c61be9902717 + to 672f0dbe-129f-4e39-913f-9f6164ea2867 +- COPY 
4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 0b20868c-c619-454c-82a1-c61be9902717 + to c8ec1d5b-84dd-4289-a193-b9f88de4a994 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 53e4567c-cafa-4f36-8e84-c80aa8bc9be6 + to e061d0fd-fa18-4618-928b-0d44efef92cb +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from c8ec1d5b-84dd-4289-a193-b9f88de4a994 + to 9d075952-b580-46f2-a7b5-61446592403c +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 0b20868c-c619-454c-82a1-c61be9902717 + to 2ea0d711-7b30-48c8-836c-e7a9254b30d2 +- COPY 1e4efa1fbad16cb0cfafde5b2884858e83fa4146c34e4d5ca8f7f7dfaa083b1c + from 0b20868c-c619-454c-82a1-c61be9902717 + to 358c9f85-6135-436c-be75-a925e1c77ac3 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 2ea0d711-7b30-48c8-836c-e7a9254b30d2 + to 3d5b1185-f39e-41ef-b57e-2bd9e1396251 +- COPY 4127c8baefbea0939f1524712b4d37a020b28cdd32138ae34d22e6397af3a9a7 + from 0b20868c-c619-454c-82a1-c61be9902717 + to 9d075952-b580-46f2-a7b5-61446592403c diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 993afe0ebd5..054c673d486 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -706,6 +706,9 @@ pub enum SledFilter { /// Sleds which should be sent VPC firewall rules. VpcFirewall, + + /// Sleds which should have TUF repo artifacts replicated onto them. + TufArtifactReplication, } impl SledFilter { @@ -761,6 +764,7 @@ impl SledPolicy { SledFilter::ReservationCreate => true, SledFilter::VpcRouting => true, SledFilter::VpcFirewall => true, + SledFilter::TufArtifactReplication => true, }, SledPolicy::InService { provision_policy: SledProvisionPolicy::NonProvisionable, @@ -774,6 +778,7 @@ impl SledPolicy { SledFilter::ReservationCreate => false, SledFilter::VpcRouting => true, SledFilter::VpcFirewall => true, + SledFilter::TufArtifactReplication => true, }, SledPolicy::Expunged => match filter { SledFilter::All => true, @@ -785,6 +790,7 @@ impl SledPolicy { SledFilter::ReservationCreate => false, SledFilter::VpcRouting => false, SledFilter::VpcFirewall => false, + SledFilter::TufArtifactReplication => false, }, } } @@ -818,6 +824,7 @@ impl SledState { SledFilter::ReservationCreate => true, SledFilter::VpcRouting => true, SledFilter::VpcFirewall => true, + SledFilter::TufArtifactReplication => true, }, SledState::Decommissioned => match filter { SledFilter::All => true, @@ -829,6 +836,7 @@ impl SledState { SledFilter::ReservationCreate => false, SledFilter::VpcRouting => false, SledFilter::VpcFirewall => false, + SledFilter::TufArtifactReplication => false, }, } } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index b5f7fd19d39..99c8c7d2a32 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -2,10 +2,16 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+use chrono::DateTime; +use chrono::Utc; +use omicron_common::update::ArtifactHash; +use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; use serde::Serialize; use std::collections::BTreeMap; +use std::collections::VecDeque; +use std::sync::Arc; use uuid::Uuid; /// The status of a `region_replacement` background task activation @@ -231,3 +237,112 @@ impl SupportBundleCollectionReport { } } } + +/// The status of a `tuf_artifact_replication` background task activation +#[derive(Debug, Serialize, Deserialize, PartialEq)] +pub struct TufArtifactReplicationStatus { + pub last_run_counters: TufArtifactReplicationCounters, + pub lifetime_counters: TufArtifactReplicationCounters, + pub request_debug_ringbuf: Arc>, + pub local_repos: usize, +} + +impl TufArtifactReplicationStatus { + pub fn last_run_ok(&self) -> bool { + self.last_run_counters.list_err == 0 + && self.last_run_counters.put_err == 0 + && self.last_run_counters.copy_err == 0 + && self.last_run_counters.delete_err == 0 + } +} + +#[derive( + Debug, + Default, + Clone, + Copy, + Serialize, + Deserialize, + PartialEq, + derive_more::AddAssign, +)] +pub struct TufArtifactReplicationCounters { + pub list_ok: usize, + pub list_err: usize, + pub put_ok: usize, + pub put_err: usize, + pub copy_ok: usize, + pub copy_err: usize, + pub delete_ok: usize, + pub delete_err: usize, +} + +impl TufArtifactReplicationCounters { + pub fn inc(&mut self, request: &TufArtifactReplicationRequest) { + match (&request.operation, &request.error) { + (TufArtifactReplicationOperation::List, Some(_)) => { + self.list_err += 1 + } + (TufArtifactReplicationOperation::List, None) => self.list_ok += 1, + (TufArtifactReplicationOperation::Put { .. }, Some(_)) => { + self.put_err += 1 + } + (TufArtifactReplicationOperation::Put { .. }, None) => { + self.put_ok += 1 + } + (TufArtifactReplicationOperation::Copy { .. }, Some(_)) => { + self.copy_err += 1 + } + (TufArtifactReplicationOperation::Copy { .. }, None) => { + self.copy_ok += 1 + } + (TufArtifactReplicationOperation::Delete { .. }, Some(_)) => { + self.delete_err += 1 + } + (TufArtifactReplicationOperation::Delete { .. 
}, None) => { + self.delete_ok += 1 + } + } + } + + pub fn ok(&self) -> usize { + self.list_ok + .saturating_add(self.put_ok) + .saturating_add(self.copy_ok) + .saturating_add(self.delete_ok) + } + + pub fn err(&self) -> usize { + self.list_err + .saturating_add(self.put_err) + .saturating_add(self.copy_err) + .saturating_add(self.delete_err) + } + + pub fn sum(&self) -> usize { + self.ok().saturating_add(self.err()) + } +} + +#[derive( + Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, +)] +pub struct TufArtifactReplicationRequest { + pub time: DateTime, + pub target_sled: SledUuid, + #[serde(flatten)] + pub operation: TufArtifactReplicationOperation, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +#[derive( + Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, +)] +#[serde(tag = "operation", rename_all = "snake_case")] +pub enum TufArtifactReplicationOperation { + List, + Put { hash: ArtifactHash }, + Copy { hash: ArtifactHash, source_sled: SledUuid }, + Delete { hash: ArtifactHash }, +} diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index bbc75653703..a094df614cf 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -36,6 +36,9 @@ pub struct SledAgentInfo { /// The address of the sled agent's API endpoint pub sa_address: SocketAddrV6, + /// The port of the Repo Depot API endpoint, on the same IP as `sa_address` + pub repo_depot_port: u16, + /// Describes the responsibilities of the sled pub role: SledRole, diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index a408e54dff2..8760fa35868 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5497,6 +5497,12 @@ } ] }, + "repo_depot_port": { + "description": "The port of the Repo Depot API endpoint, on the same IP as `sa_address`", + "type": "integer", + "format": "uint16", + "minimum": 0 + }, "reservoir_size": { "description": "Amount of RAM dedicated to the VMM reservoir\n\nMust be smaller than \"usable_physical_ram\"", "allOf": [ @@ -5536,6 +5542,7 @@ "baseboard", "decommissioned", "generation", + "repo_depot_port", "reservoir_size", "role", "sa_address", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index ec4300b1fa0..f52f71d451d 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -188,7 +188,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.sled ( sled_state omicron.public.sled_state NOT NULL, /* Generation number owned and incremented by the sled-agent */ - sled_agent_gen INT8 NOT NULL DEFAULT 1 + sled_agent_gen INT8 NOT NULL DEFAULT 1, + + /* The bound port of the Repo Depot API server, running on the same IP as + the sled agent server. */ + repo_depot_port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL ); -- Add an index that ensures a given physical sled (identified by serial and @@ -2321,12 +2325,17 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo ( id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, + -- TODO: Repos fetched over HTTP will not have a SHA256 hash; this is an + -- implementation detail of our ZIP archives. sha256 STRING(64) NOT NULL, -- The version of the targets.json role that was used to generate the repo. targets_role_version INT NOT NULL, -- The valid_until time for the repo. 
+    -- TODO: Figure out timestamp validity policy for uploaded repos vs those
+    -- fetched over HTTP; my (iliana's) current presumption is that we will make
+    -- this NULL for uploaded ZIP archives of repos.
     valid_until TIMESTAMPTZ NOT NULL,
 
     -- The system version described in the TUF repo.
@@ -4815,7 +4824,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '120.0.0', NULL)
+    (TRUE, NOW(), NOW(), '121.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
diff --git a/schema/crdb/tuf-artifact-replication/up01.sql b/schema/crdb/tuf-artifact-replication/up01.sql
new file mode 100644
index 00000000000..3a6c80f4297
--- /dev/null
+++ b/schema/crdb/tuf-artifact-replication/up01.sql
@@ -0,0 +1,7 @@
+ALTER TABLE omicron.public.sled
+    ADD COLUMN IF NOT EXISTS repo_depot_port INT4
+        CHECK (port BETWEEN 0 AND 65535)
+        -- This is the value of the `REPO_DEPOT_PORT` const. If we're running
+        -- this migration, we're running on a "real" system and this is
+        -- definitely the correct port. (The DEFAULT value is removed in up02.)
+        NOT NULL DEFAULT 12348;
diff --git a/schema/crdb/tuf-artifact-replication/up02.sql b/schema/crdb/tuf-artifact-replication/up02.sql
new file mode 100644
index 00000000000..142ac4c60ae
--- /dev/null
+++ b/schema/crdb/tuf-artifact-replication/up02.sql
@@ -0,0 +1,2 @@
+ALTER TABLE omicron.public.sled
+    ALTER COLUMN repo_depot_port DROP DEFAULT;
diff --git a/sled-agent/src/artifact_store.rs b/sled-agent/src/artifact_store.rs
index 73ff03c7623..0ec7fb68f11 100644
--- a/sled-agent/src/artifact_store.rs
+++ b/sled-agent/src/artifact_store.rs
@@ -20,7 +20,7 @@ use std::net::SocketAddrV6;
 use std::str::FromStr;
 use std::time::Duration;
 
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8PathBuf;
 use camino_tempfile::{NamedUtf8TempFile, Utf8TempPath};
 use dropshot::{
     Body, ConfigDropshot, FreeformBody, HttpError, HttpResponseOk, Path,
@@ -28,7 +28,6 @@ use dropshot::{
 };
 use futures::{Stream, TryStreamExt};
 use omicron_common::address::REPO_DEPOT_PORT;
-use omicron_common::disk::{DatasetKind, DatasetsConfig};
 use omicron_common::update::ArtifactHash;
 use repo_depot_api::*;
 use sha2::{Digest, Sha256};
@@ -37,7 +36,7 @@ use sled_storage::dataset::M2_ARTIFACT_DATASET;
 use sled_storage::error::Error as StorageError;
 use sled_storage::manager::StorageHandle;
 use slog::{error, info, Logger};
-use slog_error_chain::SlogInlineError;
+use slog_error_chain::{InlineErrorChain, SlogInlineError};
 use tokio::fs::{File, OpenOptions};
 use tokio::io::AsyncWriteExt;
 
@@ -444,25 +443,10 @@ pub(crate) trait DatasetsManager: Sync {
     ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, StorageError>;
 }
 
-/// Iterator `.filter().map()` common to `DatasetsManager` implementations.
-pub(crate) fn filter_dataset_mountpoints(
-    config: DatasetsConfig,
-    root: &Utf8Path,
-) -> impl Iterator<Item = Utf8PathBuf> + '_ {
-    config
-        .datasets
-        .into_values()
-        .filter(|dataset| *dataset.name.kind() == DatasetKind::Update)
-        .map(|dataset| dataset.name.mountpoint(root))
-}
-
 impl DatasetsManager for StorageHandle {
     async fn artifact_storage_paths(
         &self,
     ) -> Result<impl Iterator<Item = Utf8PathBuf> + '_, StorageError> {
-        // TODO: When datasets are managed by Reconfigurator (#6229),
-        // this should be changed to use `self.datasets_config_list()` and
-        // `filter_dataset_mountpoints`.
         Ok(self
             .get_latest_disks()
             .await
@@ -686,25 +670,33 @@ pub(crate) enum Error {
 
 impl From<Error> for HttpError {
     fn from(err: Error) -> HttpError {
         match err {
+            // 4xx errors
+            Error::HashMismatch { .. } => {
+                HttpError::for_bad_request(None, err.to_string())
+            }
+            Error::NotFound { ..
} => { + HttpError::for_not_found(None, err.to_string()) + } Error::AlreadyInProgress { .. } => HttpError::for_client_error( None, dropshot::ClientErrorStatusCode::CONFLICT, err.to_string(), ), + + // 5xx errors: ensure the error chain is logged Error::Body(inner) => inner, Error::DatasetConfig(_) | Error::NoUpdateDataset => { - HttpError::for_unavail(None, err.to_string()) + HttpError::for_unavail( + None, + InlineErrorChain::new(&err).to_string(), + ) } Error::DepotCopy { .. } | Error::File { .. } | Error::FileRename { .. } - | Error::Join(_) => HttpError::for_internal_error(err.to_string()), - Error::HashMismatch { .. } => { - HttpError::for_bad_request(None, err.to_string()) - } - Error::NotFound { .. } => { - HttpError::for_not_found(None, err.to_string()) - } + | Error::Join(_) => HttpError::for_internal_error( + InlineErrorChain::new(&err).to_string(), + ), } } } @@ -764,10 +756,14 @@ mod test { &self, ) -> Result + '_, StorageError> { - Ok(super::filter_dataset_mountpoints( - self.datasets.clone(), - self.mountpoint_root.path(), - )) + Ok(self + .datasets + .datasets + .values() + .filter(|dataset| *dataset.name.kind() == DatasetKind::Update) + .map(|dataset| { + dataset.name.mountpoint(self.mountpoint_root.path()) + })) } } diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index d1646823bb6..fbacdce1136 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -164,6 +164,7 @@ type GetSledAgentInfo = Box SledAgentInfo + Send>; pub struct NexusNotifierInput { pub sled_id: SledUuid, pub sled_address: SocketAddrV6, + pub repo_depot_port: u16, pub nexus_client: NexusClient, pub hardware: HardwareManager, pub vmm_reservoir_manager: VmmReservoirManagerHandle, @@ -248,6 +249,7 @@ impl NexusNotifierTask { let NexusNotifierInput { sled_id, sled_address, + repo_depot_port, nexus_client, hardware, vmm_reservoir_manager, @@ -265,6 +267,7 @@ impl NexusNotifierTask { }; SledAgentInfo { sa_address: sled_address.to_string(), + repo_depot_port, role, baseboard: hardware.baseboard().convert(), usable_hardware_threads: hardware.online_processor_count(), @@ -643,6 +646,7 @@ mod test { let latest_sled_agent_info = Arc::new(std::sync::Mutex::new(SledAgentInfo { sa_address: sa_address.clone(), + repo_depot_port: 0, role: nexus_client::types::SledRole::Gimlet, baseboard: Baseboard::new_pc("test".into(), "test".into()) .convert(), diff --git a/sled-agent/src/sim/artifact_store.rs b/sled-agent/src/sim/artifact_store.rs index efb4fa9f36e..5f74f6b82ef 100644 --- a/sled-agent/src/sim/artifact_store.rs +++ b/sled-agent/src/sim/artifact_store.rs @@ -6,21 +6,29 @@ //! storage. use camino_tempfile::Utf8TempDir; +use dropshot::{ + Body, ConfigDropshot, FreeformBody, HttpError, HttpResponseOk, HttpServer, + Path, RequestContext, ServerBuilder, +}; +use repo_depot_api::*; use sled_storage::error::Error as StorageError; +use std::sync::Arc; -use super::storage::Storage; -use crate::artifact_store::DatasetsManager; +use crate::artifact_store::{ArtifactStore, DatasetsManager}; +#[derive(Clone)] pub(super) struct SimArtifactStorage { - root: Utf8TempDir, - backend: Storage, + // We simulate the two M.2s with two separate temporary directories. 
+ dirs: Arc<[Utf8TempDir; 2]>, } impl SimArtifactStorage { - pub(super) fn new(backend: Storage) -> SimArtifactStorage { + pub(super) fn new() -> SimArtifactStorage { SimArtifactStorage { - root: camino_tempfile::tempdir().unwrap(), - backend, + dirs: Arc::new([ + camino_tempfile::tempdir().unwrap(), + camino_tempfile::tempdir().unwrap(), + ]), } } } @@ -30,14 +38,45 @@ impl DatasetsManager for SimArtifactStorage { &self, ) -> Result + '_, StorageError> { - let config = self - .backend - .lock() - .datasets_config_list() - .map_err(|_| StorageError::LedgerNotFound)?; - Ok(crate::artifact_store::filter_dataset_mountpoints( - config, - self.root.path(), - )) + Ok(self.dirs.iter().map(|tempdir| tempdir.path().to_owned())) + } +} + +impl ArtifactStore { + pub(super) fn start( + &self, + log: &slog::Logger, + dropshot_config: &ConfigDropshot, + ) -> HttpServer { + ServerBuilder::new( + repo_depot_api_mod::api_description::() + .expect("registered entrypoints"), + self.clone(), + log.new(o!("component" => "dropshot (Repo Depot)")), + ) + .config(dropshot_config.clone()) + .start() + .unwrap() + } +} + +/// Implementation of the Repo Depot API backed by an +/// `ArtifactStore`. +pub(super) enum RepoDepotImpl {} + +impl RepoDepotApi for RepoDepotImpl { + type Context = ArtifactStore; + + async fn artifact_get_by_sha256( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let sha256 = path_params.into_inner().sha256; + let file = rqctx.context().get(sha256).await?; + let file_access = hyper_staticfile::vfs::TokioFileAccess::new(file); + let file_stream = + hyper_staticfile::util::FileBytesStream::new(file_access); + let body = Body::wrap(hyper_staticfile::Body::Full(file_stream)); + Ok(HttpResponseOk(FreeformBody(body))) } } diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index ad22e42950a..ffdd1bb45de 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -120,6 +120,7 @@ impl Server { // TODO-robustness if this returns a 400 error, we probably want to // return a permanent error from the `notify_nexus` closure. 
let sa_address = http_server.local_addr(); + let repo_depot_port = sled_agent.repo_depot.local_addr().port(); let config_clone = config.clone(); let log_clone = log.clone(); let task = tokio::spawn(async move { @@ -133,6 +134,7 @@ impl Server { &config.id, &NexusTypes::SledAgentInfo { sa_address: sa_address.to_string(), + repo_depot_port, role: NexusTypes::SledRole::Scrimlet, baseboard: NexusTypes::Baseboard { serial: format!( diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 96544c26f75..3109f9b6d04 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -104,7 +104,8 @@ pub struct SledAgent { fake_zones: Mutex, instance_ensure_state_error: Mutex>, pub bootstore_network_config: Mutex, - artifacts: ArtifactStore, + pub(super) repo_depot: + dropshot::HttpServer>, pub log: Logger, } @@ -187,8 +188,8 @@ impl SledAgent { config.storage.ip, storage_log, ); - let artifacts = - ArtifactStore::new(&log, SimArtifactStorage::new(storage.clone())); + let repo_depot = ArtifactStore::new(&log, SimArtifactStorage::new()) + .start(&log, &config.dropshot); Arc::new(SledAgent { id, @@ -218,7 +219,7 @@ impl SledAgent { zones: vec![], }), instance_ensure_state_error: Mutex::new(None), - artifacts, + repo_depot, log, bootstore_network_config, }) @@ -594,7 +595,7 @@ impl SledAgent { } pub(super) fn artifact_store(&self) -> &ArtifactStore { - &self.artifacts + self.repo_depot.app_private() } pub async fn vmm_count(&self) -> usize { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 4e3b1710388..89abfdfde34 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -585,11 +585,16 @@ impl SledAgent { ) .await?; + let repo_depot = ArtifactStore::new(&log, storage_manager.clone()) + .start(sled_address, &config.dropshot) + .await?; + // Spawn a background task for managing notifications to nexus // about this sled-agent. 
let nexus_notifier_input = NexusNotifierInput { sled_id: request.body.id, sled_address: get_sled_address(request.body.subnet), + repo_depot_port: repo_depot.local_addr().port(), nexus_client: nexus_client.clone(), hardware: long_running_task_handles.hardware_manager.clone(), vmm_reservoir_manager: vmm_reservoir_manager.clone(), @@ -611,10 +616,6 @@ impl SledAgent { log.new(o!("component" => "ProbeManager")), ); - let repo_depot = ArtifactStore::new(&log, storage_manager.clone()) - .start(sled_address, &config.dropshot) - .await?; - let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 28c03ef4241..d4cba6f53d1 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -73,6 +73,7 @@ region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 +tuf_artifact_replication.period_secs = 300 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index bfc04f5b974..f9c54715382 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -73,6 +73,7 @@ region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 +tuf_artifact_replication.period_secs = 300 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. diff --git a/tufaceous/manifests/fake.toml b/tufaceous/manifests/fake.toml index a71a5e853fd..c3f6404f537 100644 --- a/tufaceous/manifests/fake.toml +++ b/tufaceous/manifests/fake.toml @@ -23,7 +23,7 @@ version = "1.0.0" [artifact.host.source] kind = "composite-host" phase_1 = { kind = "fake", size = "512KiB" } -phase_2 = { kind = "fake", size = "3MiB" } +phase_2 = { kind = "fake", size = "1MiB" } [[artifact.trampoline]] name = "fake-trampoline" @@ -31,7 +31,7 @@ version = "1.0.0" [artifact.trampoline.source] kind = "composite-host" phase_1 = { kind = "fake", size = "512KiB" } -phase_2 = { kind = "fake", size = "3MiB" } +phase_2 = { kind = "fake", size = "1MiB" } [[artifact.control_plane]] name = "fake-control-plane" diff --git a/update-common/src/artifacts/extracted_artifacts.rs b/update-common/src/artifacts/extracted_artifacts.rs index 5ac4a3a3954..309b188a9dc 100644 --- a/update-common/src/artifacts/extracted_artifacts.rs +++ b/update-common/src/artifacts/extracted_artifacts.rs @@ -25,8 +25,8 @@ use tokio_util::io::ReaderStream; /// Handle to the data of an extracted artifact. /// -/// This does not contain the actual data; use `reader_stream()` to get a new -/// handle to the underlying file to read it on demand. +/// This does not contain the actual data; use `file()` or `reader_stream()` to +/// get a new handle to the underlying file to read it on demand. /// /// Note that although this type implements `Clone` and that cloning is /// relatively cheap, it has additional implications on filesystem cleanup. @@ -69,20 +69,26 @@ impl ExtractedArtifactDataHandle { self.hash_id.hash } - /// Async stream to read the contents of this artifact on demand. + /// Opens the file for this artifact. 
/// /// This can fail due to I/O errors outside our control (e.g., something /// removed the contents of our temporary directory). - pub async fn reader_stream( - &self, - ) -> anyhow::Result> { + pub async fn file(&self) -> anyhow::Result { let path = path_for_artifact(&self.tempdir, &self.hash_id); - let file = tokio::fs::File::open(&path) + tokio::fs::File::open(&path) .await - .with_context(|| format!("failed to open {path}"))?; + .with_context(|| format!("failed to open {path}")) + } - Ok(ReaderStream::new(file)) + /// Async stream to read the contents of this artifact on demand. + /// + /// This can fail due to I/O errors outside our control (e.g., something + /// removed the contents of our temporary directory). + pub async fn reader_stream( + &self, + ) -> anyhow::Result> { + Ok(ReaderStream::new(self.file().await?)) } } From 56bc6fc559c73c86dcc686f6f1483a74288e0f0e Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 28 Jan 2025 17:44:48 +0000 Subject: [PATCH 2/8] move min sled replication to config --- nexus-config/src/nexus_config.rs | 8 ++- nexus/Cargo.toml | 4 -- nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/src/app/background/init.rs | 1 + .../tasks/tuf_artifact_replication.rs | 54 +++++++++++++------ nexus/tests/config.test.toml | 2 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 9 files changed, 52 insertions(+), 21 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index dfaa847851f..10f7c0e3106 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -730,6 +730,9 @@ pub struct TufArtifactReplicationConfig { /// period (in seconds) for periodic activations of this background task #[serde_as(as = "DurationSeconds")] pub period_secs: Duration, + /// The number of sleds that artifacts must be present on before a local + /// copy of a repo's artifacts is dropped. + pub min_sled_replication: usize, } /// Configuration for a nexus server @@ -989,6 +992,7 @@ mod test { region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 + tuf_artifact_replication.min_sled_replication = 3 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1187,7 +1191,8 @@ mod test { }, tuf_artifact_replication: TufArtifactReplicationConfig { - period_secs: Duration::from_secs(300) + period_secs: Duration::from_secs(300), + min_sled_replication: 3, }, }, default_region_allocation_strategy: @@ -1273,6 +1278,7 @@ mod test { region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 + tuf_artifact_replication.min_sled_replication = 3 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 9a02bd40c94..fbc86a42c26 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -7,10 +7,6 @@ license = "MPL-2.0" [lints] workspace = true -[features] -# Set by omicron-package based on the target configuration. 
-rack-topology-single-sled = [] - [build-dependencies] omicron-rpaths.workspace = true diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index a423ae25b22..6e27e849f97 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -142,6 +142,7 @@ region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 +tuf_artifact_replication.min_sled_replication = 3 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 972ae2f0140..8307a6a25db 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -128,6 +128,7 @@ region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 +tuf_artifact_replication.min_sled_replication = 3 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index cda5ccd997b..eed15a1e5a5 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -890,6 +890,7 @@ impl BackgroundTasksInitializer { tuf_artifact_replication::ArtifactReplication::new( datastore.clone(), args.tuf_artifact_replication_rx, + config.tuf_artifact_replication.min_sled_replication, ), ), opctx: opctx.child(BTreeMap::new()), diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index aae41bbdae9..360edbbe441 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -89,13 +89,6 @@ use crate::app::background::BackgroundTask; // to reduce bandwidth spikes for PUT requests; other requests should return // quickly. const MAX_REQUEST_CONCURRENCY: usize = 8; -// The number of sleds that artifacts must be present on before the local copy -// of artifacts is dropped. This is ignored if there are fewer than this many -// sleds in the system. -#[cfg(not(feature = "rack-topology-single-sled"))] -const MIN_SLED_REPLICATION: usize = 3; -#[cfg(feature = "rack-topology-single-sled")] -const MIN_SLED_REPLICATION: usize = 1; // The number of copies of an artifact we expect each sled to have. This is currently 2 because // sleds store a copy of each artifact on each M.2 device. const EXPECTED_COUNT: u32 = 2; @@ -150,6 +143,7 @@ impl Inventory { self, sleds: &'a [Sled], rng: &mut impl rand::Rng, + min_sled_replication: usize, ) -> Requests<'a> { let mut requests = Requests::default(); for (hash, presence) in self.0 { @@ -169,7 +163,7 @@ impl Inventory { // meet `MIN_SLED_REPLICATION`. let mut sled_puts = Vec::new(); if let Some(handle) = presence.local { - let count = MIN_SLED_REPLICATION + let count = min_sled_replication .saturating_sub(sleds_present.len()); for _ in 0..count { let Some(sled) = sleds_not_present.pop() else { @@ -442,6 +436,7 @@ pub struct ArtifactReplication { datastore: Arc, local: Vec, local_rx: mpsc::Receiver, + min_sled_replication: usize, /// List of recent requests for debugging. 
request_debug_ringbuf: Arc>, lifetime_counters: TufArtifactReplicationCounters, @@ -485,8 +480,11 @@ impl BackgroundTask for ArtifactReplication { .await; self.list_and_clean_up_local_artifacts(&mut inventory); - let requests = - inventory.into_requests(&sleds, &mut rand::thread_rng()); + let requests = inventory.into_requests( + &sleds, + &mut rand::thread_rng(), + self.min_sled_replication, + ); let completed = requests .into_stream(&opctx.log) .buffer_unordered(MAX_REQUEST_CONCURRENCY) @@ -511,11 +509,13 @@ impl ArtifactReplication { pub fn new( datastore: Arc, local_rx: mpsc::Receiver, + min_sled_replication: usize, ) -> ArtifactReplication { ArtifactReplication { datastore, local: Vec::new(), local_rx, + min_sled_replication, request_debug_ringbuf: Arc::new(VecDeque::new()), lifetime_counters: TufArtifactReplicationCounters::default(), } @@ -669,7 +669,7 @@ impl ArtifactReplication { if let Some(presence) = inventory.0.get_mut(&hash_id.hash) { presence.local = Some(ArtifactHandle::Extracted(handle)); - if presence.sleds.len() < MIN_SLED_REPLICATION { + if presence.sleds.len() < self.min_sled_replication { keep_plan = true; } } @@ -689,6 +689,8 @@ mod tests { use super::*; + const MIN_SLED_REPLICATION: usize = 3; + /// Create a list of `Sled`s suitable for testing /// `Inventory::into_requests`. Neither the `client` or `depot_base_url` /// fields are ever used. @@ -809,7 +811,11 @@ mod tests { }, ); } - let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + let requests = Inventory(inventory).into_requests( + &sleds, + &mut rng, + MIN_SLED_REPLICATION, + ); check_consistency(&requests); assert_eq!(requests.put.len(), MIN_SLED_REPLICATION * 2); assert_eq!(requests.copy_after_put.len(), 13 * 2); @@ -842,7 +848,11 @@ mod tests { }, ); } - let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + let requests = Inventory(inventory).into_requests( + &sleds, + &mut rng, + MIN_SLED_REPLICATION, + ); check_consistency(&requests); assert_eq!(requests.put.len(), 0); assert_eq!(requests.copy_after_put.len(), 0); @@ -868,7 +878,11 @@ mod tests { wanted: false, }, ); - let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + let requests = Inventory(inventory).into_requests( + &sleds, + &mut rng, + MIN_SLED_REPLICATION, + ); check_consistency(&requests); assert_eq!(requests.put.len(), 0); assert_eq!(requests.copy_after_put.len(), 0); @@ -905,7 +919,11 @@ mod tests { wanted: true, }, ); - let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + let requests = Inventory(inventory).into_requests( + &sleds, + &mut rng, + MIN_SLED_REPLICATION, + ); check_consistency(&requests); assert_eq!(requests.put.len(), 0); assert_eq!(requests.copy_after_put.len(), 0); @@ -931,7 +949,11 @@ mod tests { wanted: true, }, ); - let requests = Inventory(inventory).into_requests(&sleds, &mut rng); + let requests = Inventory(inventory).into_requests( + &sleds, + &mut rng, + MIN_SLED_REPLICATION, + ); check_consistency(&requests); assert_eq!(requests.put.len(), 0); assert_eq!(requests.copy_after_put.len(), 0); diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 4338afaa360..634a98c8937 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -159,6 +159,8 @@ region_snapshot_replacement_finish.period_secs = 60 # manually to test the result of each activation, so set the activation period # to something we'll never see in a test run. 
tuf_artifact_replication.period_secs = 3600 +# Update integration tests are started with 4 sled agents. +tuf_artifact_replication.min_sled_replication = 3 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index d4cba6f53d1..f9b24c663f1 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -74,6 +74,7 @@ region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 +tuf_artifact_replication.min_sled_replication = 3 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index f9c54715382..ba951ba4e07 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -74,6 +74,7 @@ region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 +tuf_artifact_replication.min_sled_replication = 1 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From 8ee633da71a35cc098e4adb6586467d72cbf11b3 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 28 Jan 2025 18:24:18 +0000 Subject: [PATCH 3/8] other review changes --- .../tasks/tuf_artifact_replication.rs | 44 ++++++++++++------- nexus/src/app/update/mod.rs | 8 ++++ nexus/test-utils/src/background.rs | 17 ------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index 360edbbe441..d6b63ca15cf 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -49,16 +49,16 @@ //! `MAX_REQUEST_CONCURRENCY`, which is intended to reduce bandwidth spikes for //! PUT requests. //! -//! TODO: In addition to Nexus concurrency rate limits, we should also rate -//! limit requests per second sent by Nexus, as well as limit the number of -//! ongoing copy requests being processed at once by Sled Agent. +//! TODO: (omicron#7400) In addition to Nexus concurrency rate limits, we should +//! also rate limit requests per second sent by Nexus, as well as limit the +//! number of ongoing copy requests being processed at once by Sled Agent. 
use std::collections::{BTreeMap, VecDeque}; use std::future::Future; use std::str::FromStr; use std::sync::Arc; -use anyhow::Result; +use anyhow::{Context, Result}; use chrono::Utc; use futures::future::BoxFuture; use futures::{FutureExt, Stream, StreamExt}; @@ -461,16 +461,23 @@ impl BackgroundTask for ArtifactReplication { } } - let sleds = match self.list_sleds(opctx).await { + let sleds = match self + .list_sleds(opctx) + .await + .context("failed to list sleds") + { Ok(sleds) => sleds, Err(err) => return json!({"error": format!("{err:#}")}), }; let mut counters = TufArtifactReplicationCounters::default(); - let mut inventory = - match self.list_artifacts_from_database(opctx).await { - Ok(inventory) => inventory, - Err(err) => return json!({"error": format!("{err:#}")}), - }; + let mut inventory = match self + .list_artifacts_from_database(opctx) + .await + .context("failed to list artifacts from database") + { + Ok(inventory) => inventory, + Err(err) => return json!({"error": format!("{err:#}")}), + }; self.list_artifacts_on_sleds( opctx, &sleds, @@ -577,16 +584,13 @@ impl ArtifactReplication { .await?; paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); for artifact in batch { - // Two artifacts can have the same sha256 but all the values in - // this insertion step are the default value. - inventory.0.insert( - artifact.sha256.0, + inventory.0.entry(artifact.sha256.0).or_insert_with(|| { ArtifactPresence { sleds: BTreeMap::new(), local: None, wanted: true, - }, - ); + } + }); } } Ok(inventory) @@ -612,6 +616,11 @@ impl ArtifactReplication { let mut error = None; match response { Ok(response) => { + info!( + &opctx.log, + "Successfully got artifact list"; + "sled" => sled.client.baseurl(), + ); for (hash, count) in response.into_inner() { let Ok(hash) = ArtifactHash::from_str(&hash) else { error = Some(format!( @@ -619,7 +628,8 @@ impl ArtifactReplication { )); error!( &opctx.log, - "sled reported bogus artifact hash"; + "Failed to get artifact list: \ + sled reported bogus artifact hash"; "sled" => sled.client.baseurl(), "bogus_hash" => hash, ); diff --git a/nexus/src/app/update/mod.rs b/nexus/src/app/update/mod.rs index 57c469e5f25..d9d02a2c3a9 100644 --- a/nexus/src/app/update/mod.rs +++ b/nexus/src/app/update/mod.rs @@ -75,6 +75,14 @@ impl super::Nexus { .send(artifacts_with_plan) .await .map_err(|err| { + // In theory this should never happen; `Sender::send` + // returns an error only if the receiver has hung up, and + // the receiver should live for as long as Nexus does (it + // belongs to the background task driver). + // + // If this _does_ happen, the impact is that the database + // has recorded a repository for which we no longer have + // the artifacts. Error::internal_error(&format!( "failed to send artifacts for replication: {err}" )) diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 84a83631feb..b01046543bf 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -423,20 +423,3 @@ pub async fn run_tuf_artifact_replication_step( assert_eq!(status.last_run_counters.err(), 0); status } - -/// Run the `tuf_artifact_replication` task until the status indicates the task -/// has stabilized: no outstanding requests, and no local repositories. Panics -/// if the status does not change between runs. 
-pub async fn run_tuf_artifact_replication_to_completion( - internal_client: &ClientTestContext, -) { - let mut status = run_tuf_artifact_replication_step(internal_client).await; - while status.local_repos > 0 { - let new_status = - run_tuf_artifact_replication_step(internal_client).await; - if new_status == status { - panic!("TUF artifact replication stalled: {new_status:?}"); - } - status = new_status; - } -} From 2a0166358660ae8d61555fdb5f79cfd1f5e008f7 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 28 Jan 2025 19:07:13 +0000 Subject: [PATCH 4/8] comment reflows --- .../app/background/tasks/tuf_artifact_replication.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index d6b63ca15cf..b540c7db905 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -89,8 +89,8 @@ use crate::app::background::BackgroundTask; // to reduce bandwidth spikes for PUT requests; other requests should return // quickly. const MAX_REQUEST_CONCURRENCY: usize = 8; -// The number of copies of an artifact we expect each sled to have. This is currently 2 because -// sleds store a copy of each artifact on each M.2 device. +// The number of copies of an artifact we expect each sled to have. This is +// currently 2 because sleds store a copy of each artifact on each M.2 device. const EXPECTED_COUNT: u32 = 2; // How many recent requests to remember for debugging purposes. At 32 sleds // and 64 artifacts per repo, this is enough to remember at least the most @@ -159,8 +159,8 @@ impl Inventory { }); sleds_not_present.shuffle(rng); - // If we have a local copy, PUT the artifact to more sleds until we - // meet `MIN_SLED_REPLICATION`. + // If we have a local copy, PUT the artifact to more sleds until + // we meet `MIN_SLED_REPLICATION`. let mut sled_puts = Vec::new(); if let Some(handle) = presence.local { let count = min_sled_replication @@ -640,8 +640,8 @@ impl ArtifactReplication { ArtifactPresence { sleds: BTreeMap::new(), local: None, - // If we're inserting, this artifact wasn't listed in the - // database. + // If we're inserting, this artifact wasn't + // listed in the database. wanted: false, } }); From 3e4362c706b94afe05c48da9749cbdd490029e3f Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 28 Jan 2025 23:48:30 +0000 Subject: [PATCH 5/8] work around #7417 with a redactor --- dev-tools/omdb/tests/successes.out | 16 +-- dev-tools/omdb/tests/test_all_output.rs | 8 ++ test-utils/src/dev/test_cmds.rs | 152 +++++++++++++++++++++++- 3 files changed, 164 insertions(+), 12 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index adf732ed43a..617d159a00e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -739,11 +739,9 @@ task: "tuf_artifact_replication" last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms request ringbuf: - - target sled: ..................... 
- operation: List - at: + last run: - list ok: 1 + list ok: list err: 0 put ok: 0 put err: 0 @@ -752,7 +750,7 @@ task: "tuf_artifact_replication" delete ok: 0 delete err: 0 lifetime: - list ok: 1 + list ok: list err: 0 put ok: 0 put err: 0 @@ -1232,11 +1230,9 @@ task: "tuf_artifact_replication" last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms request ringbuf: - - target sled: ..................... - operation: List - at: + last run: - list ok: 1 + list ok: list err: 0 put ok: 0 put err: 0 @@ -1245,7 +1241,7 @@ task: "tuf_artifact_replication" delete ok: 0 delete err: 0 lifetime: - list ok: 1 + list ok: list err: 0 put ok: 0 put err: 0 diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 75a74c2d321..c975fc36d8a 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -219,6 +219,14 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { redactor.extra_variable_length("cockroachdb_version", &crdb_version); } + // The `tuf_artifact_replication` task's output depends on how + // many sleds happened to register with Nexus before its first + // execution. These redactions work around the issue described in + // https://github.com/oxidecomputer/omicron/issues/7417. + redactor + .field("list ok:", r"\d+") + .section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]); + for args in invocations { println!("running commands with args: {:?}", args); let p = postgres_url.to_string(); diff --git a/test-utils/src/dev/test_cmds.rs b/test-utils/src/dev/test_cmds.rs index d2662abb1ce..e226103dc18 100644 --- a/test-utils/src/dev/test_cmds.rs +++ b/test-utils/src/dev/test_cmds.rs @@ -130,18 +130,32 @@ pub struct Redactor<'a> { basic: bool, uuids: bool, extra: Vec<(&'a str, String)>, + extra_regex: Vec<(regex::Regex, String)>, + sections: Vec<&'a [&'a str]>, } impl Default for Redactor<'_> { fn default() -> Self { - Self { basic: true, uuids: true, extra: Vec::new() } + Self { + basic: true, + uuids: true, + extra: Vec::new(), + extra_regex: Vec::new(), + sections: Vec::new(), + } } } impl<'a> Redactor<'a> { /// Create a new redactor that does not do any redactions. pub fn noop() -> Self { - Self { basic: false, uuids: false, extra: Vec::new() } + Self { + basic: false, + uuids: false, + extra: Vec::new(), + extra_regex: Vec::new(), + sections: Vec::new(), + } } pub fn basic(&mut self, basic: bool) -> &mut Self { @@ -180,6 +194,70 @@ impl<'a> Redactor<'a> { self } + /// Redact the value of a named field with a known value shape. + /// + /// This can be used for redacting a common value (such as a small integer) + /// that changes from run to run but otherwise doesn't have any context + /// that helps it be redacted using other methods. The value will only + /// be redacted if it matches `name` concatenated with one or more spaces + /// concatenated with a string matching `value_regex`. An example: + /// + /// ``` + /// # let mut redactor = Redactor::default(); + /// redactor.field("list ok:", "\d+"); + /// ``` + /// + /// will replace `list ok: 1` with `list ok: `. 
+ pub fn field(&mut self, name: &str, value_regex: &str) -> &mut Self { + let re = regex::Regex::new(&format!( + r"\b(?{} +){}\b", + regex::escape(name), + value_regex + )) + .unwrap(); + let replacement = format!( + "$prefix<{}_REDACTED>", + name.replace(|c: char| c.is_ascii_punctuation(), "") + .replace(" ", "_") + .to_uppercase() + ); + self.extra_regex.push((re, replacement)); + self + } + + /// Redact an entire indented section. + /// + /// This can be used if the shape of a section might change from run to run. + /// + /// `headings` is the path of heading names indicating the section to + /// redact. For example, to redact only the first "ringbuf:" section from + /// this output: + /// + /// ```ignore + /// section A: + /// nested: + /// ringbuf: + /// this should be redacted + /// section B: + /// ringbuf: + /// this should not be redacted + /// ``` + /// + /// we can use: + /// + /// ``` + /// # let mut redactor = Redactor::default(); + /// redactor.section(&["section A:", "ringbuf:"]); + /// ``` + /// + /// Note that not all section headings need to be listed in `headings` in + /// order for the section to be redacted. + pub fn section(&mut self, headings: &'a [&'a str]) -> &mut Self { + assert!(!headings.is_empty(), "headings should not be empty"); + self.sections.push(headings); + self + } + pub fn do_redact(&self, input: &str) -> String { // Perform extra redactions at the beginning, not the end. This is because // some of the built-in redactions in redact_variable might match a @@ -189,6 +267,12 @@ impl<'a> Redactor<'a> { for (name, replacement) in &self.extra { s = s.replace(name, replacement); } + for (regex, replacement) in &self.extra_regex { + s = regex.replace_all(&s, replacement).into_owned(); + } + for headings in &self.sections { + s = redact_section(&s, headings); + } if self.basic { s = redact_basic(&s); @@ -298,6 +382,39 @@ fn redact_uuids(input: &str) -> String { .to_string() } +fn redact_section(input: &str, headings: &[&str]) -> String { + let mut output = String::new(); + let mut indent_stack = Vec::new(); + let mut print_redacted = false; + for line in input.split_inclusive('\n') { + let indent = line.len() - line.trim_start().len(); + if !line.trim().is_empty() { + while let Some(last) = indent_stack.pop() { + if indent > last { + indent_stack.push(last); + break; + } + } + } + if indent_stack.len() == headings.len() { + if print_redacted { + print_redacted = false; + output.push_str(&line[..indent]); + output.push_str("\n"); + } + continue; + } + output.push_str(line); + if line[indent..].trim_end() == headings[indent_stack.len()] { + indent_stack.push(indent); + if indent_stack.len() == headings.len() { + print_redacted = true; + } + } + } + output +} + fn fill_redaction_text(name: &str, text_to_redact_len: usize) -> String { // The overall plan is to generate a string of the form // ------, depending on the length of the text to @@ -378,4 +495,35 @@ mod tests { ); } } + + #[test] + fn test_redact_section() { + const INPUT: &str = "\ +section A: + nested: + ringbuf: + this should be redacted + a second line to be redacted + + a line followed by an empty line +section B: + ringbuf: + this should not be redacted + + a line followed by an empty line"; + const OUTPUT: &str = "\ +section A: + nested: + ringbuf: + +section B: + ringbuf: + this should not be redacted + + a line followed by an empty line"; + + let mut redactor = Redactor::default(); + redactor.section(&["section A:", "ringbuf:"]); + assert_eq!(redactor.do_redact(INPUT), OUTPUT); + } } From 
a21023d035cabeec52441bb170d507b74c9a3655 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Tue, 28 Jan 2025 23:49:12 +0000 Subject: [PATCH 6/8] config nits --- nexus/examples/config-second.toml | 2 +- nexus/examples/config.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 6e27e849f97..c21e470a6d5 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -142,7 +142,7 @@ region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 -tuf_artifact_replication.min_sled_replication = 3 +tuf_artifact_replication.min_sled_replication = 1 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 8307a6a25db..78a72487c8f 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -128,7 +128,7 @@ region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 -tuf_artifact_replication.min_sled_replication = 3 +tuf_artifact_replication.min_sled_replication = 1 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. From 79638c36f239c0e08f6ad305342083bd2863509c Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 29 Jan 2025 00:03:43 +0000 Subject: [PATCH 7/8] fix redactor doctests --- test-utils/src/dev/test_cmds.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test-utils/src/dev/test_cmds.rs b/test-utils/src/dev/test_cmds.rs index e226103dc18..6661e003f8b 100644 --- a/test-utils/src/dev/test_cmds.rs +++ b/test-utils/src/dev/test_cmds.rs @@ -203,8 +203,9 @@ impl<'a> Redactor<'a> { /// concatenated with a string matching `value_regex`. An example: /// /// ``` + /// # use omicron_test_utils::dev::test_cmds::Redactor; /// # let mut redactor = Redactor::default(); - /// redactor.field("list ok:", "\d+"); + /// redactor.field("list ok:", r"\d+"); /// ``` /// /// will replace `list ok: 1` with `list ok: `. @@ -246,6 +247,7 @@ impl<'a> Redactor<'a> { /// we can use: /// /// ``` + /// # use omicron_test_utils::dev::test_cmds::Redactor; /// # let mut redactor = Redactor::default(); /// redactor.section(&["section A:", "ringbuf:"]); /// ``` From f12aa0a1227b4f42b25828e311e4b844535743a4 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 29 Jan 2025 05:56:50 +0000 Subject: [PATCH 8/8] also work around #7417 if zero sleds registered --- dev-tools/omdb/src/bin/omdb/nexus.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 64176c39d07..ce044b983de 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2165,6 +2165,9 @@ fn print_task_tuf_artifact_replication(details: &serde_json::Value) { ), Ok(status) => { println!(" request ringbuf:"); + if status.request_debug_ringbuf.is_empty() { + println!(" [no entries]"); + } for TufArtifactReplicationRequest { time, target_sled,
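
To tie the redactor changes together, a minimal sketch (not part of any patch in this series) that combines the new `field` and `section` methods the way the `test_all_output.rs` change does. The input text and the `fake-sled-agent-17` value are made up for illustration; the method signatures and the heading path come from the patches above.

```rust
use omicron_test_utils::dev::test_cmds::Redactor;

fn main() {
    // A scaled-down stand-in for the omdb background-task output whose
    // shape depends on how many sleds registered before the task's
    // first activation (omicron#7417).
    let input = "\
task: \"tuf_artifact_replication\"
  request ringbuf:
    - target sled: fake-sled-agent-17
      operation: List
  last run:
    list ok: 4
    list err: 0
";

    let mut redactor = Redactor::default();
    redactor
        // Replace whatever integer follows "list ok:" with a stable
        // placeholder derived from the field name.
        .field("list ok:", r"\d+")
        // Collapse the ringbuf entries, whose count varies between runs.
        .section(&["task: \"tuf_artifact_replication\"", "request ringbuf:"]);

    // The redacted output no longer depends on sled registration order.
    println!("{}", redactor.do_redact(input));
}
```
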