From 426cd7616acd19b3de02b60e9dd2c1e86a9924cd Mon Sep 17 00:00:00 2001 From: Adnan Dosani <258082943+adnandnv@users.noreply.github.com> Date: Thu, 18 Jun 2026 18:52:33 -0700 Subject: [PATCH] feat(observability): propagate W3C trace context across nico-api's network boundaries nico-api now accepts and produces W3C Trace Context headers (traceparent, tracestate) at its network boundaries, so a request already traced upstream stays one trace as it flows through nico-api and on to the services it calls. - Install the standard W3C TraceContextPropagator at startup (the OTel default is no-op). - Ingress (REST + gRPC): one shared per-request layer extracts the inbound context and parents the request span to it; with no valid inbound context the span is a fresh root. - Egress: gRPC clients are wrapped in a tower TraceInjectService; reqwest clients are built through reqwest-tracing's middleware; the two OAuth2 token providers inject manually, since the oauth2 crate builds their requests. - The local tracing-enabled flag stays the master switch: an inbound sampled flag never forces recording here. Shared extract/inject glue and the tower middleware live in a new trace-propagation crate, with unit tests and an ingress->egress round-trip. Per-client details and maintainer guidance are in docs/observability/tracing.md. Closes #2438. Signed-off-by: Adnan Dosani <258082943+adnandnv@users.noreply.github.com> --- Cargo.lock | 63 +++ Cargo.toml | 5 + crates/api-core/Cargo.toml | 3 + crates/api-core/src/handlers/redfish.rs | 19 +- crates/api-core/src/logging/api_logs.rs | 7 + crates/api-core/src/logging/setup.rs | 230 +++++++++- .../src/machine_identity/token_exchange.rs | 21 +- crates/api-web/Cargo.toml | 1 + crates/api-web/src/auth.rs | 4 +- crates/component-manager/Cargo.toml | 1 + crates/component-manager/src/nsm.rs | 5 +- crates/component-manager/src/psm.rs | 7 +- crates/component-manager/src/tls.rs | 9 +- crates/firmware/Cargo.toml | 2 + crates/firmware/src/downloader.rs | 10 +- crates/libmlx/Cargo.toml | 2 + crates/libmlx/src/firmware/source.rs | 7 +- crates/libnmxc/Cargo.toml | 1 + crates/libnmxc/src/lib.rs | 3 +- crates/libnmxc/src/nmxc_api.rs | 5 +- crates/libnmxm/Cargo.toml | 2 + crates/libnmxm/src/lib.rs | 14 +- crates/mqttea/Cargo.toml | 1 + crates/mqttea/src/auth/oauth2_provider.rs | 4 +- crates/nras/Cargo.toml | 2 + crates/nras/src/client.rs | 11 +- crates/nras/src/lib.rs | 6 + crates/rpc/Cargo.toml | 1 + crates/rpc/src/forge_tls_client.rs | 12 +- crates/trace-propagation/Cargo.toml | 40 ++ crates/trace-propagation/src/lib.rs | 398 ++++++++++++++++++ .../tests/no_global_propagator.rs | 86 ++++ docs/observability/tracing.md | 80 +++- 33 files changed, 994 insertions(+), 68 deletions(-) create mode 100644 crates/trace-propagation/Cargo.toml create mode 100644 crates/trace-propagation/src/lib.rs create mode 100644 crates/trace-propagation/tests/no_global_propagator.rs diff --git a/Cargo.lock b/Cargo.lock index 0a311b13a9..41cd6cb8ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1246,6 +1246,8 @@ dependencies = [ "rcgen", "regex", "reqwest 0.13.4", + "reqwest-middleware", + "reqwest-tracing", "rsa 0.9.10", "rumqttc", "rustls", @@ -1273,6 +1275,7 @@ dependencies = [ "tonic-reflection", "tower", "tower-http", + "trace-propagation", "tracing", "tracing-log", "tracing-opentelemetry", @@ -1492,6 +1495,7 @@ dependencies = [ "tonic", "tower", "tower-http", + "trace-propagation", "tracing", "url", "urlencoding", @@ -1878,6 +1882,8 @@ dependencies = [ "hex", "md5 0.8.0", "reqwest 0.13.4", + "reqwest-middleware", + "reqwest-tracing", "serde", "sqlx", "tokio", @@ -2179,6 +2185,8 @@ dependencies = [ "quick-xml 0.39.4", "regex", "reqwest 0.13.4", + "reqwest-middleware", + "reqwest-tracing", "russh", "serde", "serde_json", @@ -2683,6 +2691,7 @@ dependencies = [ "tonic-prost", "tonic-prost-build", "tower", + "trace-propagation", "tracing", "tryhard", "uuid", @@ -3500,6 +3509,7 @@ dependencies = [ "tonic", "tonic-prost", "tonic-prost-build", + "trace-propagation", "tracing", "uuid", ] @@ -6504,6 +6514,7 @@ dependencies = [ "tonic", "tonic-prost", "tonic-prost-build", + "trace-propagation", "tracing", "tracing-subscriber", ] @@ -6516,6 +6527,8 @@ dependencies = [ "async-trait", "http", "reqwest 0.13.4", + "reqwest-middleware", + "reqwest-tracing", "serde", "serde_derive", "serde_json", @@ -7054,6 +7067,7 @@ dependencies = [ "thiserror 2.0.18", "tokio", "tokio-test", + "trace-propagation", "tracing", "tracing-subscriber", "uuid", @@ -7355,6 +7369,8 @@ dependencies = [ "jsonwebtoken", "mockito", "reqwest 0.13.4", + "reqwest-middleware", + "reqwest-tracing", "serde", "serde_json", "thiserror 2.0.18", @@ -9221,6 +9237,38 @@ dependencies = [ "web-sys", ] +[[package]] +name = "reqwest-middleware" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest 0.13.4", + "thiserror 2.0.18", + "tower-service", +] + +[[package]] +name = "reqwest-tracing" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e5af0cd6fc3d3c8f703d597af70b6e4e62432c63157b49419fa1ffaf481702" +dependencies = [ + "anyhow", + "async-trait", + "getrandom 0.2.17", + "http", + "matchit", + "opentelemetry", + "reqwest 0.13.4", + "reqwest-middleware", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "resolv-conf" version = "0.7.6" @@ -11651,6 +11699,21 @@ dependencies = [ "tower-service", ] +[[package]] +name = "trace-propagation" +version = "0.0.0" +dependencies = [ + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry_sdk", + "tokio", + "tower", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", +] + [[package]] name = "tracing" version = "0.1.44" diff --git a/Cargo.toml b/Cargo.toml index b08d7aea86..54ba812c87 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ sqlx-macros-core = { version = "0.9" } opentelemetry = { version = "0.31.0", features = ["logs"] } opentelemetry_sdk = { version = "0.31.0", features = ["logs", "rt-tokio"] } +opentelemetry-http = "0.31.0" opentelemetry-otlp = { version = "0.31.1", features = ["metrics"] } opentelemetry-prometheus = "0.31.0" opentelemetry-semantic-conventions = "0.31.0" @@ -195,6 +196,10 @@ ratatui = "0.30.0" rcgen = "0.14.7" regex = "1.11.2" reqwest = { default-features = false, version = "0.13.3" } +reqwest-middleware = "0.5" +reqwest-tracing = { version = "0.7", default-features = false, features = [ + "opentelemetry_0_31", +] } resolv-conf = "0.7.0" ringbuf = "0.5.0" rsa = "0.9.9" diff --git a/crates/api-core/Cargo.toml b/crates/api-core/Cargo.toml index f2b1378a11..b1b92be0b8 100644 --- a/crates/api-core/Cargo.toml +++ b/crates/api-core/Cargo.toml @@ -76,6 +76,7 @@ nras = { path = "../nras" } spancounter = { path = "../spancounter" } state-controller = { path = "../state-controller" } sqlx-query-tracing = { path = "../sqlx-query-tracing" } +trace-propagation = { path = "../trace-propagation" } # External dependencies. PLEASE KEEP ALPHABETIZED ORDER. arc-swap = { workspace = true } @@ -132,6 +133,8 @@ reqwest = { workspace = true, default-features = false, features = [ "rustls", "stream", ] } +reqwest-middleware = { workspace = true } +reqwest-tracing = { workspace = true } rsa = { workspace = true } rumqttc = { workspace = true } rustls = { workspace = true } diff --git a/crates/api-core/src/handlers/redfish.rs b/crates/api-core/src/handlers/redfish.rs index f4ed5fec48..78b0dfd256 100644 --- a/crates/api-core/src/handlers/redfish.rs +++ b/crates/api-core/src/handlers/redfish.rs @@ -393,7 +393,7 @@ pub(crate) async fn create_client( rpc::forge::BmcMetaDataGetResponse, http::Uri, HeaderMap, - reqwest::Client, + reqwest_middleware::ClientWithMiddleware, ), CarbideError, > { @@ -450,7 +450,7 @@ pub(crate) async fn create_client( .connect_timeout(std::time::Duration::from_secs(5)) // Limit connections to 5 seconds .timeout(std::time::Duration::from_secs(60)); // Limit the overall request to 60 seconds - match builder.build() { + let client = match builder.build() { Ok(client) => client, Err(err) => { tracing::error!(%err, "build_http_client"); @@ -458,7 +458,12 @@ pub(crate) async fn create_client( "Http building failed: {err}" ))); } - } + }; + // The `reqwest-tracing` middleware injects the current span's W3C trace context into every + // outgoing request (#2438). + reqwest_middleware::ClientBuilder::new(client) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() }; Ok((metadata, new_uri, headers, http_client)) } @@ -558,15 +563,15 @@ impl TestBehavior { } } -// Subset of the data we care about from reqwest::Error, so that we can mock it (we can't build our -// own reqwest::Error as its constructors are all private.) +// Subset of the data we care about from the HTTP error, so that we can mock it (we can't build our +// own reqwest error as its constructors are all private.) pub struct RequestErrorInfo { pub status_code: Option, pub description: String, } -impl From for RequestErrorInfo { - fn from(e: reqwest::Error) -> Self { +impl From for RequestErrorInfo { + fn from(e: reqwest_middleware::Error) -> Self { Self { status_code: e.status(), description: e.to_string(), diff --git a/crates/api-core/src/logging/api_logs.rs b/crates/api-core/src/logging/api_logs.rs index d80119355e..5bfb4fea5c 100644 --- a/crates/api-core/src/logging/api_logs.rs +++ b/crates/api-core/src/logging/api_logs.rs @@ -163,6 +163,13 @@ where tenant.organization_id = tracing::field::Empty, ); + // Continue any distributed trace the caller started: make the inbound W3C trace + // context (`traceparent`/`tracestate`) the parent of this request span. The shared + // tower layer serves both the gRPC and REST listeners, so this single extractor + // covers both ingress paths. No-op when there is no valid inbound context, leaving + // the request span a fresh root (issue #2438). + trace_propagation::set_span_parent_from_headers(&request_span, request.headers()); + // Try to extract the gRPC service and method from the URI let mut grpc_method: Option = None; let mut grpc_service: Option = None; diff --git a/crates/api-core/src/logging/setup.rs b/crates/api-core/src/logging/setup.rs index 20e64e704b..a4fda23da0 100644 --- a/crates/api-core/src/logging/setup.rs +++ b/crates/api-core/src/logging/setup.rs @@ -21,12 +21,15 @@ use std::sync::atomic::{AtomicBool, Ordering}; use carbide_metrics_utils::OtelView; use eyre::WrapErr; use opentelemetry::metrics::{Meter, MeterProvider}; -use opentelemetry::trace::{Link, SamplingDecision, SamplingResult, SpanKind, TracerProvider}; -use opentelemetry::{Context, KeyValue, TraceId, Value}; +use opentelemetry::trace::{ + Link, SamplingDecision, SamplingResult, SpanKind, TraceContextExt, TracerProvider, +}; +use opentelemetry::{Context, KeyValue, TraceId, Value, global}; use opentelemetry_otlp::{ExportConfig, WithExportConfig}; use opentelemetry_sdk::Resource; use opentelemetry_sdk::metrics::SdkMeterProvider; -use opentelemetry_sdk::trace::{Sampler, ShouldSample}; +use opentelemetry_sdk::propagation::TraceContextPropagator; +use opentelemetry_sdk::trace::ShouldSample; use opentelemetry_semantic_conventions as semcov; use spancounter::SpanCountReader; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; @@ -81,6 +84,11 @@ pub fn setup_logging( log_history_max_bytes: usize, tracing_config: &TracingConfig, ) -> eyre::Result { + // Install the global W3C trace-context propagator so NICo can extract inbound and inject + // outbound `traceparent`/`tracestate` at network boundaries (issue #2438). Without it, + // OpenTelemetry's default propagator is a no-op. + global::set_text_map_propagator(TraceContextPropagator::new()); + // This configures emission of logs in LogFmt syntax // and emission of metrics let log_level = match debug { @@ -150,7 +158,7 @@ pub fn setup_logging( let tracer_provider = opentelemetry_sdk::trace::SdkTracerProvider::builder() // CarbideSpanSampler selects explicitly marked application trace roots. - .with_sampler(trace_sampler.into_sampler()) + .with_sampler(trace_sampler) .with_batch_exporter(otlp_exporter) .with_resource( Resource::builder() @@ -259,11 +267,6 @@ impl CarbideSpanSampler { fn new(enabled: Arc) -> Self { Self(enabled) } - - /// Construct a new Sampler that samples spans originating from carbide-api. - fn into_sampler(self) -> Sampler { - Sampler::ParentBased(Box::new(self)) - } } /// Predicate to check if a child span or event should be accepted. This is distinct from @@ -283,7 +286,7 @@ fn should_accept_span_or_event(metadata: &tracing::Metadata<'_>) -> bool { impl ShouldSample for CarbideSpanSampler { fn should_sample( &self, - _parent_context: Option<&Context>, + parent_context: Option<&Context>, _trace_id: TraceId, _name: &str, _span_kind: &SpanKind, @@ -292,23 +295,34 @@ impl ShouldSample for CarbideSpanSampler { ) -> SamplingResult { let enabled = self.0.load(Ordering::Relaxed); - // We want this to short-circuit if enabled is false, because we want to skip iterating - // through all attributes. (This could be a simple && expression but it should be really - // clear from reading the code here.) - let should_sample = if !enabled { - false - } else { - is_carbide_root_span(attributes) - }; + let parent_span_context = parent_context + .map(|cx| cx.span().span_context().clone()) + .filter(|sc| sc.is_valid()); - SamplingResult { - decision: if should_sample { + let decision = if !enabled { + SamplingDecision::Drop + } else if let Some(local_parent) = parent_span_context.as_ref().filter(|sc| !sc.is_remote()) + { + // In-process parent: inherit its sampling decision. + if local_parent.is_sampled() { SamplingDecision::RecordAndSample } else { SamplingDecision::Drop - }, + } + } else if is_carbide_root_span(attributes) { + // Root or remote-parented span: record iff explicitly marked `carbide.trace_root`. + SamplingDecision::RecordAndSample + } else { + SamplingDecision::Drop + }; + + SamplingResult { + decision, attributes: vec![], - trace_state: Default::default(), + // Carry any inbound `tracestate` so it propagates onward unchanged. + trace_state: parent_span_context + .map(|sc| sc.trace_state().clone()) + .unwrap_or_default(), } } } @@ -407,6 +421,178 @@ mod tests { assert!(matches!(decision, SamplingDecision::Drop)); } + fn sample_decision_with_parent( + enabled: bool, + parent: Option<&Context>, + attributes: Vec, + ) -> SamplingDecision { + CarbideSpanSampler::new(Arc::new(AtomicBool::new(enabled))) + .should_sample( + parent, + TraceId::from(1u128), + "request", + &SpanKind::Server, + &attributes, + &[], + ) + .decision + } + + /// A parent `Context` carrying a span context with the given sampled / remote flags. + fn parent_ctx(sampled: bool, remote: bool) -> Context { + use opentelemetry::trace::{SpanContext, SpanId, TraceContextExt, TraceFlags, TraceState}; + + let sc = SpanContext::new( + TraceId::from(0x1111_1111_1111_1111_1111_1111_1111_1111u128), + SpanId::from(0x2222_2222_2222_2222u64), + if sampled { + TraceFlags::SAMPLED + } else { + TraceFlags::default() + }, + remote, + TraceState::default(), + ); + Context::new().with_remote_span_context(sc) + } + + fn root_marker() -> Vec { + vec![KeyValue::new(CARBIDE_TRACE_ROOT_ATTRIBUTE, true)] + } + + #[test] + fn sampler_disabled_drops_even_with_sampled_remote_parent() { + // issue #2438: the local toggle is the master switch — an inbound sampled trace + // must not be recorded while tracing is disabled locally. + assert!(matches!( + sample_decision_with_parent(false, Some(&parent_ctx(true, true)), root_marker()), + SamplingDecision::Drop + )); + } + + #[test] + fn sampler_remote_parent_gated_by_marker_not_inbound_sampled() { + // A remote (ingress) parent never inherits its decision: the local `carbide.trace_root` + // marker decides, whether or not the inbound trace was sampled. + for remote_sampled in [true, false] { + let parent = parent_ctx(remote_sampled, true); + assert!( + matches!( + sample_decision_with_parent(true, Some(&parent), root_marker()), + SamplingDecision::RecordAndSample + ), + "marked -> record (inbound sampled={remote_sampled})" + ); + assert!( + matches!( + sample_decision_with_parent(true, Some(&parent), vec![]), + SamplingDecision::Drop + ), + "unmarked -> drop (inbound sampled={remote_sampled})" + ); + } + } + + #[test] + fn sampler_inherits_in_process_parent_decision() { + // Sampled in-process parent pulls its subtree in, even without a marker. + assert!(matches!( + sample_decision_with_parent(true, Some(&parent_ctx(true, false)), vec![]), + SamplingDecision::RecordAndSample + )); + // Unsampled in-process parent drops the subtree, even if the child is marked. + assert!(matches!( + sample_decision_with_parent(true, Some(&parent_ctx(false, false)), root_marker()), + SamplingDecision::Drop + )); + } + + #[test] + fn sampler_carries_parent_tracestate() { + // The inbound `tracestate` must ride through the SamplingResult so it propagates onward + // (issue #2438). A neutral, W3C-formatted single-entry tracestate keeps ordering trivial. + use opentelemetry::trace::{SpanContext, SpanId, TraceContextExt, TraceFlags, TraceState}; + + let trace_state = TraceState::from_key_value([("key1", "value1")]).unwrap(); + let parent = Context::new().with_remote_span_context(SpanContext::new( + TraceId::from(0x1111_1111_1111_1111_1111_1111_1111_1111u128), + SpanId::from(0x2222_2222_2222_2222u64), + TraceFlags::SAMPLED, + true, + trace_state.clone(), + )); + + let result = CarbideSpanSampler::new(Arc::new(AtomicBool::new(true))).should_sample( + Some(&parent), + TraceId::from(1u128), + "request", + &SpanKind::Server, + &root_marker(), + &[], + ); + + assert_eq!(result.trace_state, trace_state); + } + + /// An OpenTelemetry layer is present (an exporter was built) but the `tracing-enabled` flag is + /// off, so `CarbideSpanSampler` drops every span. Verifies what NICo forwards on egress in that + /// state: does an inbound trace-id still reach a downstream service even though NICo records + /// nothing? + /// + /// Inbound and egress headers are built and asserted as raw `traceparent` strings (the W3C wire + /// format), so this test relies only on the propagation entry points NICo actually calls. + #[test] + fn egress_forwards_trace_id_when_exporter_on_but_flag_off() { + use opentelemetry::trace::TracerProvider; + use opentelemetry_sdk::propagation::TraceContextPropagator; + use opentelemetry_sdk::trace::SdkTracerProvider; + use trace_propagation::{inject_current_context, set_span_parent_from_headers}; + use tracing_subscriber::prelude::*; + + global::set_text_map_propagator(TraceContextPropagator::new()); + + // OTel layer present ("exporter on"), runtime flag OFF -> the sampler drops everything. + let provider = SdkTracerProvider::builder() + .with_sampler(CarbideSpanSampler::new(Arc::new(AtomicBool::new(false)))) + .build(); + let subscriber = tracing_subscriber::registry() + .with(tracing_opentelemetry::layer().with_tracer(provider.tracer("carbide"))); + + // Inbound: a sampled remote trace as a raw W3C traceparent (final `01` = sampled). + let mut inbound = http::HeaderMap::new(); + inbound.insert( + "traceparent", + "00-11111111111111111111111111111111-2222222222222222-01" + .parse() + .unwrap(), + ); + + let mut egress = http::HeaderMap::new(); + tracing::subscriber::with_default(subscriber, || { + let span = tracing::info_span!("request"); + set_span_parent_from_headers(&span, &inbound); + let _enter = span.enter(); + inject_current_context(&mut egress); + }); + + let out = egress + .get("traceparent") + .expect("egress should still carry a traceparent") + .to_str() + .unwrap(); + // The inbound trace-id IS forwarded downstream even though NICo recorded nothing -- the + // dropped request span still inherits it (with a fresh child span-id)... + assert!( + out.starts_with("00-11111111111111111111111111111111-"), + "egress must forward the inbound trace-id: {out}" + ); + // ...but it goes out marked NOT sampled, because NICo dropped its span. + assert!( + out.ends_with("-00"), + "egress must be marked not-sampled (NICo dropped its span): {out}" + ); + } + /// This test mostly mimics the test setup above and checks whether /// the prometheus opentelemetry stack will only report the most recent /// values for gauges and not cached values that are not important anymore diff --git a/crates/api-core/src/machine_identity/token_exchange.rs b/crates/api-core/src/machine_identity/token_exchange.rs index 7bbfe34573..789cfdcef6 100644 --- a/crates/api-core/src/machine_identity/token_exchange.rs +++ b/crates/api-core/src/machine_identity/token_exchange.rs @@ -45,7 +45,7 @@ struct TokenExchangeHttpResponseBody { /// When `token_endpoint_http_proxy` is set and non-empty, all those requests go through that proxy. pub(crate) fn token_exchange_http_client( token_endpoint_http_proxy: Option<&str>, -) -> Result { +) -> Result { let mut builder = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .redirect(reqwest::redirect::Policy::none()); @@ -57,9 +57,14 @@ pub(crate) fn token_exchange_http_client( })?; builder = builder.proxy(proxy); } - builder + let client = builder .build() - .map_err(|e| CarbideError::internal(format!("token exchange HTTP client: {e}")).into()) + .map_err(|e| CarbideError::internal(format!("token exchange HTTP client: {e}")))?; + // The `reqwest-tracing` middleware injects the current span's W3C trace context into every + // outgoing request (#2438). + Ok(reqwest_middleware::ClientBuilder::new(client) + .with(reqwest_tracing::TracingMiddleware::default()) + .build()) } pub(crate) fn rfc8693_token_exchange_form( @@ -80,7 +85,7 @@ pub(crate) fn rfc8693_token_exchange_form( /// the tenant `token_endpoint` (HTTP POST, `application/x-www-form-urlencoded` body from /// [`rfc8693_token_exchange_form`]) and maps the JSON response to [`MachineIdentityResponse`]. pub(crate) async fn token_exchange_request( - http: &reqwest::Client, + http: &reqwest_middleware::ClientWithMiddleware, token_endpoint: &str, subject_jwt: &str, workload_audiences: &[String], @@ -258,7 +263,9 @@ mod tests { .create_async() .await; - let client = reqwest::Client::new(); + let client = reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(); let url = format!("{}/token", server.url()); token_exchange_request( &client, @@ -291,7 +298,9 @@ mod tests { .create_async() .await; - let client = reqwest::Client::new(); + let client = reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(); let url = format!("{}/token", server.url()); let creds = ("foo".to_string(), "bar".to_string()); let out = token_exchange_request(&client, &url, "j", &[], Some(&creds)) diff --git a/crates/api-web/Cargo.toml b/crates/api-web/Cargo.toml index 1d44216e02..b3fe0af7a4 100644 --- a/crates/api-web/Cargo.toml +++ b/crates/api-web/Cargo.toml @@ -38,6 +38,7 @@ carbide-rpc-utils = { path = "../rpc-utils" } carbide-uuid = { path = "../uuid", features = ["sqlx"] } carbide-version = { path = "../version" } config-version = { path = "../config-version", features = ["sqlx"] } +trace-propagation = { path = "../trace-propagation" } # External dependencies. PLEASE KEEP ALPHABETIZED ORDER. ansi-to-html = { workspace = true } diff --git a/crates/api-web/src/auth.rs b/crates/api-web/src/auth.rs index 8ed189480b..446814af6d 100644 --- a/crates/api-web/src/auth.rs +++ b/crates/api-web/src/auth.rs @@ -504,7 +504,9 @@ impl<'c> oauth2::AsyncHttpClient<'c> for AsyncRequestHandlerWithTimeouts<'_> { type Future = Pin>, Self::Error>> + Send + 'c>>; - fn call(&'c self, request: HttpRequest) -> Self::Future { + fn call(&'c self, mut request: HttpRequest) -> Self::Future { + // Inject the issuing span's W3C trace context into the outgoing OAuth2 request (#2438). + trace_propagation::inject_current_context(request.headers_mut()); Box::pin(async move { let response = self.client.execute(request.try_into()?).await?; let mut result = http::Response::builder().status(response.status()); diff --git a/crates/component-manager/Cargo.toml b/crates/component-manager/Cargo.toml index 28295bc9af..8c9242f939 100644 --- a/crates/component-manager/Cargo.toml +++ b/crates/component-manager/Cargo.toml @@ -29,6 +29,7 @@ carbide-rack = { path = "../rack", default-features = false } carbide-redfish = { path = "../redfish" } carbide-secrets = { path = "../secrets" } carbide-uuid = { path = "../uuid" } +trace-propagation = { path = "../trace-propagation" } chrono = { workspace = true } libredfish = { workspace = true } librms = { workspace = true } diff --git a/crates/component-manager/src/nsm.rs b/crates/component-manager/src/nsm.rs index b47f1940f2..dc1ff959ba 100644 --- a/crates/component-manager/src/nsm.rs +++ b/crates/component-manager/src/nsm.rs @@ -6,6 +6,7 @@ use std::collections::HashMap; use mac_address::MacAddress; use model::component_manager::{FirmwareState, NvSwitchComponent, PowerAction}; use tonic::transport::Channel; +use trace_propagation::TraceInjectService; use tracing::instrument; use crate::config::BackendTlsConfig; @@ -19,7 +20,7 @@ use crate::types::parse_mac; #[derive(Debug)] pub struct NsmSwitchBackend { - client: nsm::nv_switch_manager_client::NvSwitchManagerClient, + client: nsm::nv_switch_manager_client::NvSwitchManagerClient>, } impl NsmSwitchBackend { @@ -101,7 +102,7 @@ fn build_registration(ep: &SwitchEndpoint) -> nsm::RegisterNvSwitchRequest { /// registration once NSM includes a correlation key (e.g. BMC MAC) in /// RegisterNVSwitchResponse. async fn register_and_map( - client: &mut nsm::nv_switch_manager_client::NvSwitchManagerClient, + client: &mut nsm::nv_switch_manager_client::NvSwitchManagerClient>, endpoints: &[SwitchEndpoint], ) -> Result<(HashMap, HashMap), ComponentManagerError> { let mut mac_to_uuid: HashMap = HashMap::new(); diff --git a/crates/component-manager/src/psm.rs b/crates/component-manager/src/psm.rs index c95edf3db4..c07243aad8 100644 --- a/crates/component-manager/src/psm.rs +++ b/crates/component-manager/src/psm.rs @@ -4,6 +4,7 @@ use carbide_secrets::credentials::Credentials; use model::component_manager::{FirmwareState, PowerAction, PowerShelfComponent}; use tonic::transport::Channel; +use trace_propagation::TraceInjectService; use tracing::instrument; use crate::config::BackendTlsConfig; @@ -17,7 +18,7 @@ use crate::types::parse_mac; #[derive(Debug)] pub struct PsmPowerShelfBackend { - client: psm::powershelf_manager_client::PowershelfManagerClient, + client: psm::powershelf_manager_client::PowershelfManagerClient>, } impl PsmPowerShelfBackend { @@ -73,7 +74,9 @@ fn mac_strings(endpoints: &[PowerShelfEndpoint]) -> Vec { /// registration is primarily about ensuring PSM knows about the device and /// has credentials. async fn register_with_psm( - client: &mut psm::powershelf_manager_client::PowershelfManagerClient, + client: &mut psm::powershelf_manager_client::PowershelfManagerClient< + TraceInjectService, + >, endpoints: &[PowerShelfEndpoint], ) -> Result<(), ComponentManagerError> { let reqs: Vec = endpoints diff --git a/crates/component-manager/src/tls.rs b/crates/component-manager/src/tls.rs index c49c84e932..1efdf3ff86 100644 --- a/crates/component-manager/src/tls.rs +++ b/crates/component-manager/src/tls.rs @@ -2,11 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 use tonic::transport::{Certificate, Channel, ClientTlsConfig, Identity}; +use trace_propagation::TraceInjectService; use crate::config::BackendTlsConfig; use crate::error::ComponentManagerError; -/// Build a tonic `Channel` to `url`, optionally configured with mTLS. +/// Build a tonic `Channel` to `url`, optionally configured with mTLS, wrapped so every request it +/// sends carries the current span's W3C trace context (issue #2438). Because the wrap lives here, +/// any backend built on top of `build_channel` gets trace propagation automatically. /// /// When `tls` is `None`, a plain-text channel is returned (suitable for /// local development or in-cluster plaintext). When `Some`, the channel @@ -17,7 +20,7 @@ pub async fn build_channel( url: &str, tls: Option<&BackendTlsConfig>, service_label: &str, -) -> Result { +) -> Result, ComponentManagerError> { let endpoint = Channel::from_shared(url.to_owned()).map_err(|e| { ComponentManagerError::InvalidArgument(format!("invalid {service_label} URL: {e}")) })?; @@ -68,5 +71,5 @@ pub async fn build_channel( None => endpoint.connect().await?, }; - Ok(channel) + Ok(TraceInjectService::new(channel)) } diff --git a/crates/firmware/Cargo.toml b/crates/firmware/Cargo.toml index dfb41aebf7..62faa68954 100644 --- a/crates/firmware/Cargo.toml +++ b/crates/firmware/Cargo.toml @@ -22,6 +22,8 @@ futures-util = { workspace = true } hex = { workspace = true } md5 = { workspace = true } reqwest = { workspace = true } +reqwest-middleware = { workspace = true } +reqwest-tracing = { workspace = true } serde = { workspace = true } tokio = { workspace = true } toml = { workspace = true } diff --git a/crates/firmware/src/downloader.rs b/crates/firmware/src/downloader.rs index 09e6e46cc8..a9f46d3da5 100644 --- a/crates/firmware/src/downloader.rs +++ b/crates/firmware/src/downloader.rs @@ -24,7 +24,7 @@ use std::time::Duration; use eyre::{Report, WrapErr, eyre}; use futures_util::StreamExt; -use reqwest::Client; +use reqwest_middleware::ClientWithMiddleware as Client; use tokio::fs::File; #[derive(Clone, Debug)] @@ -94,7 +94,13 @@ impl FirmwareDownloader { state.downloading.insert(filename_string.clone()); if state.client.is_none() { - state.client = Some(Client::new()); + // The `reqwest-tracing` middleware injects the current span's W3C trace context into + // every outgoing request (#2438). + state.client = Some( + reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(), + ); } let filename = filename.to_path_buf(); diff --git a/crates/libmlx/Cargo.toml b/crates/libmlx/Cargo.toml index a43b07e3ef..e47a41949e 100644 --- a/crates/libmlx/Cargo.toml +++ b/crates/libmlx/Cargo.toml @@ -49,6 +49,8 @@ prettytable-rs = { workspace = true } # Async / networking tokio = { features = ["full"], workspace = true } reqwest = { features = ["rustls"], workspace = true } +reqwest-middleware = { workspace = true } +reqwest-tracing = { workspace = true } # Error handling thiserror = { workspace = true } diff --git a/crates/libmlx/src/firmware/source.rs b/crates/libmlx/src/firmware/source.rs index 92681b2700..af334a8ba4 100644 --- a/crates/libmlx/src/firmware/source.rs +++ b/crates/libmlx/src/firmware/source.rs @@ -194,8 +194,11 @@ async fn resolve_http( tracing::debug!(credential_type = %credential_type_name(creds), "Using HTTP credentials"); } - // Build the HTTP request with optional credentials. - let client = reqwest::Client::new(); + // Build the HTTP request with optional credentials. The `reqwest-tracing` middleware injects + // the current span's W3C trace context into the outgoing request (#2438). + let client = reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(); let mut request = client.get(url); if let Some(creds) = credentials { diff --git a/crates/libnmxc/Cargo.toml b/crates/libnmxc/Cargo.toml index 1230e45576..437c0648fe 100644 --- a/crates/libnmxc/Cargo.toml +++ b/crates/libnmxc/Cargo.toml @@ -28,6 +28,7 @@ path = "src/lib.rs" [dependencies] async-trait = { workspace = true } +trace-propagation = { path = "../trace-propagation" } http = { workspace = true } tokio = { workspace = true, features = [ "fs", diff --git a/crates/libnmxc/src/lib.rs b/crates/libnmxc/src/lib.rs index 437464eceb..401965bcba 100644 --- a/crates/libnmxc/src/lib.rs +++ b/crates/libnmxc/src/lib.rs @@ -161,7 +161,8 @@ impl NmxcClientPool { pub async fn create_client(&self, endpoint: Endpoint) -> Result, NmxcError> { let channel = self.connect(&endpoint).await?; - let client = NmxControllerClient::new(channel).max_decoding_message_size(usize::MAX); + let client = NmxControllerClient::new(trace_propagation::TraceInjectService::new(channel)) + .max_decoding_message_size(usize::MAX); let nmxc = NmxcApi::new(client); Ok(Box::new(nmxc)) } diff --git a/crates/libnmxc/src/nmxc_api.rs b/crates/libnmxc/src/nmxc_api.rs index b34bd01804..9bf9caafb0 100644 --- a/crates/libnmxc/src/nmxc_api.rs +++ b/crates/libnmxc/src/nmxc_api.rs @@ -15,6 +15,7 @@ * limitations under the License. */ use tonic::transport::Channel; +use trace_propagation::TraceInjectService; use crate::nmxc_model::nmx_controller_client::NmxControllerClient; use crate::response::check_server_header_success; @@ -35,11 +36,11 @@ fn default_context() -> nmxc_model::Context { } pub struct NmxcApi { - client: NmxControllerClient, + client: NmxControllerClient>, } impl NmxcApi { - pub fn new(client: NmxControllerClient) -> Self { + pub fn new(client: NmxControllerClient>) -> Self { Self { client } } } diff --git a/crates/libnmxm/Cargo.toml b/crates/libnmxm/Cargo.toml index 774784e26a..cd465b15b5 100644 --- a/crates/libnmxm/Cargo.toml +++ b/crates/libnmxm/Cargo.toml @@ -40,6 +40,8 @@ reqwest = { workspace = true, default-features = false, features = [ "socks", "cookies", ] } +reqwest-middleware = { workspace = true } +reqwest-tracing = { workspace = true } tracing = { workspace = true } thiserror = { workspace = true } anyhow = { workspace = true } diff --git a/crates/libnmxm/src/lib.rs b/crates/libnmxm/src/lib.rs index 97868dc72b..afd47d016d 100644 --- a/crates/libnmxm/src/lib.rs +++ b/crates/libnmxm/src/lib.rs @@ -22,7 +22,8 @@ use std::string::String; use std::time::Duration; use reqwest::header::{ACCEPT, CONTENT_TYPE, HeaderValue, USER_AGENT}; -use reqwest::{Client as HttpClient, ClientBuilder, Method, StatusCode}; +use reqwest::{ClientBuilder, Method, StatusCode}; +use reqwest_middleware::ClientWithMiddleware as HttpClient; use serde::Serialize; use serde::de::DeserializeOwned; use tracing::debug; @@ -35,7 +36,10 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); #[derive(thiserror::Error, Debug)] pub enum NmxmApiError { #[error("Network error talking to NMX-M server at {url}. {source}")] - NetworkError { url: String, source: reqwest::Error }, + NetworkError { + url: String, + source: reqwest_middleware::Error, + }, #[error("HTTP {status_code} at {url}: {response_body}")] HTTPErrorCode { @@ -120,6 +124,12 @@ impl NmxmClientPoolBuilder { .timeout(self.timeout) .build()?; + // The `reqwest-tracing` middleware injects the current span's W3C trace context into every + // outgoing request (#2438). + let client = reqwest_middleware::ClientBuilder::new(client) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(); + let pool = NmxmClientPool { client }; Ok(pool) diff --git a/crates/mqttea/Cargo.toml b/crates/mqttea/Cargo.toml index bb66786165..febce26c19 100644 --- a/crates/mqttea/Cargo.toml +++ b/crates/mqttea/Cargo.toml @@ -26,6 +26,7 @@ name = "mqttea" path = "src/lib.rs" [dependencies] +trace-propagation = { path = "../trace-propagation" } tokio = { features = ["full"], workspace = true } rumqttc = { workspace = true } prost = { workspace = true } diff --git a/crates/mqttea/src/auth/oauth2_provider.rs b/crates/mqttea/src/auth/oauth2_provider.rs index 212a34b339..7c043dbfe3 100644 --- a/crates/mqttea/src/auth/oauth2_provider.rs +++ b/crates/mqttea/src/auth/oauth2_provider.rs @@ -298,7 +298,9 @@ impl<'c> AsyncHttpClient<'c> for AsyncHttpClientWrapper<'_> { type Error = reqwest::Error; type Future = Pin> + Send + 'c>>; - fn call(&'c self, request: HttpRequest) -> Self::Future { + fn call(&'c self, mut request: HttpRequest) -> Self::Future { + // Inject the issuing span's W3C trace context into the outgoing OAuth2 request (#2438). + trace_propagation::inject_current_context(request.headers_mut()); Box::pin(async move { let response = self.client.execute(request.try_into()?).await?; let status = response.status(); diff --git a/crates/nras/Cargo.toml b/crates/nras/Cargo.toml index 5b13550478..613e29daa6 100644 --- a/crates/nras/Cargo.toml +++ b/crates/nras/Cargo.toml @@ -32,6 +32,8 @@ reqwest = { default-features = false, features = [ "rustls", "stream", ], workspace = true } +reqwest-middleware = { workspace = true } +reqwest-tracing = { workspace = true } serde.workspace = true serde_json.workspace = true thiserror.workspace = true diff --git a/crates/nras/src/client.rs b/crates/nras/src/client.rs index fcaf349326..0979e18bd8 100644 --- a/crates/nras/src/client.rs +++ b/crates/nras/src/client.rs @@ -42,7 +42,9 @@ pub trait VerifierClient: std::fmt::Debug + Send + Sync + 'static { #[derive(Debug)] pub struct NrasVerifierClient { config: crate::Config, - http_client: reqwest::Client, + // A `reqwest-tracing` middleware client: it injects the current span's W3C trace context into + // every outgoing request (issue #2438), so there is no per-call header injection here. + http_client: reqwest_middleware::ClientWithMiddleware, } // TODO: add config which would allow configuring the URL paths for gpu, dpu, cx7 @@ -50,7 +52,9 @@ impl NrasVerifierClient { pub fn new_with_config(config: &crate::Config) -> NrasVerifierClient { NrasVerifierClient { config: config.clone(), - http_client: reqwest::Client::new(), + http_client: reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(), } } } @@ -63,8 +67,7 @@ impl VerifierClient for NrasVerifierClient { device_attestation_info: &DeviceAttestationInfo, ) -> Result { // prepare the request - // submit to NRAS - + // submit to NRAS (the http client propagates W3C trace context via middleware) let att_response = self .http_client .post(format!( diff --git a/crates/nras/src/lib.rs b/crates/nras/src/lib.rs index eb0f7783ee..9e8866725a 100644 --- a/crates/nras/src/lib.rs +++ b/crates/nras/src/lib.rs @@ -73,6 +73,12 @@ impl From for NrasError { } } +impl From for NrasError { + fn from(value: reqwest_middleware::Error) -> NrasError { + NrasError::Communication(format!("Communication error: {}", value)) + } +} + type Evidence = String; type DeviceCertificate = String; diff --git a/crates/rpc/Cargo.toml b/crates/rpc/Cargo.toml index 569e6552df..5f8621c995 100644 --- a/crates/rpc/Cargo.toml +++ b/crates/rpc/Cargo.toml @@ -44,6 +44,7 @@ carbide-api-model = { path = "../api-model", optional = true } carbide-http-connector = { path = "../http-connector" } carbide-ipxe-renderer = { path = "../ipxe-renderer" } carbide-tls = { path = "../tls" } +trace-propagation = { path = "../trace-propagation" } carbide-health-report = { path = "../health-report" } carbide-libmlx-model = { path = "../libmlx-model" } carbide-measured-boot = { path = "../measured-boot", default-features = false } diff --git a/crates/rpc/src/forge_tls_client.rs b/crates/rpc/src/forge_tls_client.rs index 31336ec1be..cacd9d0d66 100644 --- a/crates/rpc/src/forge_tls_client.rs +++ b/crates/rpc/src/forge_tls_client.rs @@ -443,8 +443,10 @@ impl<'a> ForgeTlsClient<'a> { // We never make more than a single connection to carbide at a time. .pool_max_idle_per_host(2) .timer(TokioTimer::new()) - .build(connector) - .boxed_clone(); + .build(connector); + // Inject the issuing span's W3C trace context into every request this client sends + // (issue #2438). Wrapping before `boxed_clone` keeps the erased `BoxCloneService` type. + let hyper_client = trace_propagation::TraceInjectService::new(hyper_client).boxed_clone(); let mut forge_client = ForgeClient::with_origin(hyper_client, uri); @@ -640,8 +642,10 @@ impl<'a> ForgeTlsClient<'a> { // We never make more than a single connection to carbide at a time. .pool_max_idle_per_host(2) .timer(TokioTimer::new()) - .build(connector) - .boxed_clone(); + .build(connector); + // Inject the issuing span's W3C trace context into every request this client sends + // (issue #2438). Wrapping before `boxed_clone` keeps the erased `BoxCloneService` type. + let hyper_client = trace_propagation::TraceInjectService::new(hyper_client).boxed_clone(); let mut nmx_c_client = NmxControllerClient::with_origin(hyper_client, uri); diff --git a/crates/trace-propagation/Cargo.toml b/crates/trace-propagation/Cargo.toml new file mode 100644 index 0000000000..2e791dd884 --- /dev/null +++ b/crates/trace-propagation/Cargo.toml @@ -0,0 +1,40 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +[package] +name = "trace-propagation" +version = "0.0.0" +edition.workspace = true +license.workspace = true +authors.workspace = true +description = "W3C trace-context propagation helpers (extract on ingress, inject on egress)" + +[dependencies] +http = { workspace = true } +opentelemetry = { workspace = true } +opentelemetry-http = { workspace = true } +tower = { workspace = true } +tracing = { workspace = true } +tracing-opentelemetry = { workspace = true } + +[dev-dependencies] +opentelemetry_sdk = { workspace = true, features = ["testing"] } +tokio = { workspace = true, features = ["macros", "rt"] } +tower = { workspace = true, features = ["util"] } +tracing-subscriber = { workspace = true } + +[lints] +workspace = true diff --git a/crates/trace-propagation/src/lib.rs b/crates/trace-propagation/src/lib.rs new file mode 100644 index 0000000000..09bdf041fd --- /dev/null +++ b/crates/trace-propagation/src/lib.rs @@ -0,0 +1,398 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! W3C Trace Context propagation for HTTP and gRPC service boundaries. +//! +//! In a distributed system one request hops through many services; to see it as a single end-to-end +//! trace, each service must carry the W3C `traceparent`/`tracestate` headers across its network +//! edges — read them off incoming requests, write them onto outgoing ones. This crate is the thin +//! glue for that, on top of the standard OpenTelemetry propagator (installed once at startup) and +//! [`opentelemetry_http`]'s header adapters; there is no hand-rolled `traceparent` parsing, and one +//! code path serves both protocols (each exposes its headers as an `http::HeaderMap`). +//! +//! - **Ingress** — call [`set_span_parent_from_headers`] on the inbound request; it makes the +//! caller's span the parent of the local request span, so this service's spans join that trace. +//! - **Egress** — by client type: +//! - **tower clients** (tonic gRPC `Channel` or hyper client) — wrap in [`TraceInjectService`] to +//! stamp every request automatically. +//! - **non-tower clients** (`reqwest`): +//! - normally, use the off-the-shelf `reqwest-tracing` middleware (a separate crate). +//! - if you only hold the already-built request (so it can't go through a middleware client), +//! call [`inject_current_context`] on its header map. +//! +//! [`extract_context`] (read) and [`inject_context`] (write) are the underlying primitives, +//! exposed for direct use when the helpers above don't fit. +//! +//! When no propagator has been installed, OpenTelemetry's default is a no-op propagator, so every +//! helper here is a no-op — safe to wire in unconditionally. + +use std::task::{Context as TaskContext, Poll}; + +use opentelemetry::trace::TraceContextExt; +use opentelemetry::{Context, global}; +use opentelemetry_http::{HeaderExtractor, HeaderInjector}; +use tower::Service; +use tracing_opentelemetry::OpenTelemetrySpanExt; + +// --- Ingress --- + +/// Extract a remote [`Context`] from inbound request headers using the globally configured +/// text-map propagator. When the headers carry no valid trace context, there is nothing to copy +/// and the returned context has no valid span. +#[must_use = "the extracted context has no effect unless it is used (e.g. as a span parent)"] +pub fn extract_context(headers: &http::HeaderMap) -> Context { + global::get_text_map_propagator(|propagator| propagator.extract(&HeaderExtractor(headers))) +} + +/// Extract the inbound trace context from `headers` and, when it carries a valid span, make it the +/// parent of `span`. No-op when the headers carry no valid inbound context: an absent or malformed +/// `traceparent` leaves `span`'s parent unchanged. +pub fn set_span_parent_from_headers(span: &tracing::Span, headers: &http::HeaderMap) { + let parent_cx = extract_context(headers); + if parent_cx.span().span_context().is_valid() { + // `set_parent` is best-effort and returns a Result: it can fail when no OpenTelemetry + // layer is active, when the span has already been started, or when the span is filtered + // out by a tracing filter. In each case there is nothing to link the remote context to, + // so the error is intentionally ignored. + let _ = span.set_parent(parent_cx); + } +} + +// --- Egress --- + +/// Inject `cx` into outbound request headers using the globally configured text-map propagator. +/// When `cx` has no valid span context, there is no trace context to copy. +pub fn inject_context(cx: &Context, headers: &mut http::HeaderMap) { + global::get_text_map_propagator(|propagator| { + propagator.inject_context(cx, &mut HeaderInjector(headers)); + }); +} + +/// Inject the *current* tracing span's trace context into outbound request headers. +/// +/// Reads the OpenTelemetry context attached to [`tracing::Span::current`]; when there is no active +/// span / valid context this writes nothing, leaving the request unchanged. +pub fn inject_current_context(headers: &mut http::HeaderMap) { + inject_context(&tracing::Span::current().context(), headers); +} + +/// A [`tower::Service`] that injects the current trace context into each outbound request's headers, +/// then defers to the inner service unchanged. +/// +/// Wrap an HTTP/gRPC client service (hyper client, tonic `Channel`, …) with this so each request +/// carries `traceparent`/`tracestate` derived from the span that issued it. +#[derive(Clone, Debug)] +pub struct TraceInjectService { + inner: S, +} + +impl TraceInjectService { + pub fn new(inner: S) -> Self { + Self { inner } + } +} + +impl Service> for TraceInjectService +where + S: Service>, +{ + type Response = S::Response; + type Error = S::Error; + type Future = S::Future; + + fn poll_ready(&mut self, cx: &mut TaskContext<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, mut req: http::Request) -> Self::Future { + inject_current_context(req.headers_mut()); + self.inner.call(req) + } +} + +#[cfg(test)] +mod tests { + use opentelemetry::trace::{ + SpanContext, SpanId, TraceContextExt, TraceFlags, TraceId, TraceState, TracerProvider, + }; + use opentelemetry_sdk::propagation::TraceContextPropagator; + use opentelemetry_sdk::trace::SdkTracerProvider; + use tower::ServiceExt; + use tracing_subscriber::prelude::*; + + use super::*; + + fn install_w3c_propagator() { + global::set_text_map_propagator(TraceContextPropagator::new()); + } + + /// Run `f` with a tracing subscriber that has an OpenTelemetry layer installed, so spans + /// created inside it carry a real OTel context (what egress injection reads from). + fn with_otel_subscriber(f: impl FnOnce() -> R) -> R { + let provider = SdkTracerProvider::builder().build(); + let tracer = provider.tracer("trace-propagation-test"); + let subscriber = + tracing_subscriber::registry().with(tracing_opentelemetry::layer().with_tracer(tracer)); + tracing::subscriber::with_default(subscriber, f) + } + + fn context_with_span(trace: u128, span: u64, sampled: bool) -> Context { + let sc = SpanContext::new( + TraceId::from(trace), + SpanId::from(span), + if sampled { + TraceFlags::SAMPLED + } else { + TraceFlags::default() + }, + true, + TraceState::default(), + ); + Context::new().with_remote_span_context(sc) + } + + // --- Ingress --- + + #[test] + fn extract_returns_invalid_context_without_headers() { + install_w3c_propagator(); + let cx = extract_context(&http::HeaderMap::new()); + assert!(!cx.span().span_context().is_valid()); + } + + #[test] + fn extract_ignores_malformed_traceparent() { + install_w3c_propagator(); + let mut headers = http::HeaderMap::new(); + headers.insert("traceparent", "not-a-valid-traceparent".parse().unwrap()); + let cx = extract_context(&headers); + assert!(!cx.span().span_context().is_valid()); + } + + #[test] + fn set_span_parent_links_span_to_inbound_trace() { + install_w3c_propagator(); + let trace = 0x42u128; + let mut headers = http::HeaderMap::new(); + inject_context(&context_with_span(trace, 0x55, true), &mut headers); + + with_otel_subscriber(|| { + let span = tracing::info_span!("request"); + set_span_parent_from_headers(&span, &headers); + // The local span now belongs to the inbound trace. + assert_eq!( + span.context().span().span_context().trace_id(), + TraceId::from(trace) + ); + }); + } + + #[test] + fn set_span_parent_is_noop_without_inbound_trace() { + install_w3c_propagator(); + let inbound = 0x42u128; + with_otel_subscriber(|| { + let span = tracing::info_span!("request"); + // No inbound headers: must not adopt any specific upstream trace. + set_span_parent_from_headers(&span, &http::HeaderMap::new()); + assert_ne!( + span.context().span().span_context().trace_id(), + TraceId::from(inbound) + ); + }); + } + + // --- Egress --- + + #[test] + fn inject_then_extract_roundtrips() { + install_w3c_propagator(); + let trace = 0x42u128; + let span = 0x55u64; + let cx = context_with_span(trace, span, true); + + let mut headers = http::HeaderMap::new(); + inject_context(&cx, &mut headers); + + // The standard W3C header is present... + assert!(headers.contains_key("traceparent")); + // ...and round-trips back to the same trace / span ids. + let extracted = extract_context(&headers); + let sc = extracted.span().span_context().clone(); + assert!(sc.is_valid()); + assert_eq!(sc.trace_id(), TraceId::from(trace)); + assert_eq!(sc.span_id(), SpanId::from(span)); + assert!(sc.is_sampled()); + } + + #[test] + fn inject_writes_nothing_without_valid_context() { + install_w3c_propagator(); + let mut headers = http::HeaderMap::new(); + inject_context(&Context::new(), &mut headers); + assert!(!headers.contains_key("traceparent")); + } + + #[test] + fn inject_current_context_uses_active_span() { + install_w3c_propagator(); + with_otel_subscriber(|| { + let span = tracing::info_span!("egress"); + let _enter = span.enter(); + + let mut headers = http::HeaderMap::new(); + inject_current_context(&mut headers); + assert!(headers.contains_key("traceparent")); + }); + } + + #[test] + fn inject_current_context_writes_nothing_without_active_span() { + install_w3c_propagator(); + // No OpenTelemetry layer / active span: egress must not emit a traceparent. + let mut headers = http::HeaderMap::new(); + inject_current_context(&mut headers); + assert!(!headers.contains_key("traceparent")); + } + + #[tokio::test] + async fn inject_layer_adds_traceparent_to_forwarded_request() { + install_w3c_propagator(); + let inner = tower::service_fn(|req: http::Request<()>| async move { + Ok::<_, std::convert::Infallible>(req) + }); + let mut svc = TraceInjectService::new(inner); + svc.ready().await.expect("inner service is always ready"); + + // The layer injects synchronously in `call`, so the span need only be active there. + let future = with_otel_subscriber(|| { + let span = tracing::info_span!("egress"); + let _enter = span.enter(); + let req = http::Request::builder().body(()).unwrap(); + svc.call(req) + }); + let forwarded = future.await.unwrap(); + assert!(forwarded.headers().contains_key("traceparent")); + } + + // --- End-to-end --- + + /// End-to-end through a real OpenTelemetry pipeline with an in-memory span sink: an inbound + /// `traceparent` (ingress) becomes the parent of the request span, and an outbound request made + /// within that span (egress) carries the same trace onward. The exported span proves the link. + /// Deterministic: `AlwaysOn` sampler + `SimpleSpanProcessor` (synchronous export on span end). + #[test] + fn ingress_then_egress_round_trips_through_a_recorded_span() { + use opentelemetry_sdk::trace::{InMemorySpanExporter, Sampler, SdkTracerProvider}; + + install_w3c_propagator(); + + let exporter = InMemorySpanExporter::default(); + let provider = SdkTracerProvider::builder() + .with_sampler(Sampler::AlwaysOn) + .with_simple_exporter(exporter.clone()) + .build(); + let tracer = provider.tracer("trace-propagation-roundtrip"); + let subscriber = + tracing_subscriber::registry().with(tracing_opentelemetry::layer().with_tracer(tracer)); + + // A known trace arriving at the service's ingress boundary from an upstream caller. + let inbound_trace = 0x42u128; + let inbound_span = 0x55u64; + let mut inbound_headers = http::HeaderMap::new(); + inject_context( + &context_with_span(inbound_trace, inbound_span, true), + &mut inbound_headers, + ); + + let mut egress_headers = http::HeaderMap::new(); + tracing::subscriber::with_default(subscriber, || { + let request_span = tracing::info_span!("request"); + // INGRESS: adopt the inbound trace as the request span's parent. + set_span_parent_from_headers(&request_span, &inbound_headers); + let _entered = request_span.enter(); + // EGRESS: an outbound call issued within the request injects the current context. + inject_current_context(&mut egress_headers); + // request_span closes at end of scope -> exported synchronously by SimpleSpanProcessor. + }); + + // EGRESS: the outbound traceparent continues the inbound trace. + assert_eq!( + extract_context(&egress_headers) + .span() + .span_context() + .trace_id(), + TraceId::from(inbound_trace), + "egress traceparent should carry the inbound trace id" + ); + + // INGRESS: the recorded request span belongs to the inbound trace, parented to the caller. + let spans = exporter.get_finished_spans().expect("finished spans"); + let request = spans + .iter() + .find(|s| s.name == "request") + .expect("request span should have been exported"); + assert_eq!( + request.span_context.trace_id(), + TraceId::from(inbound_trace) + ); + assert_eq!(request.parent_span_id, SpanId::from(inbound_span)); + } + + /// `tracestate` (the W3C vendor list alongside `traceparent`) passes through ingress to egress + /// unchanged — this crate adds no entry of its own. + #[test] + fn tracestate_passes_through_ingress_then_egress() { + use opentelemetry_sdk::trace::{Sampler, SdkTracerProvider}; + + install_w3c_propagator(); + + let provider = SdkTracerProvider::builder() + .with_sampler(Sampler::AlwaysOn) + .build(); + let tracer = provider.tracer("trace-propagation-tracestate"); + let subscriber = + tracing_subscriber::registry().with(tracing_opentelemetry::layer().with_tracer(tracer)); + + // Inbound carries a traceparent plus a neutral, W3C-formatted two-entry tracestate. + let inbound_trace = 0x42u128; + let mut inbound_headers = http::HeaderMap::new(); + inject_context( + &context_with_span(inbound_trace, 0x55, true), + &mut inbound_headers, + ); + inbound_headers.insert("tracestate", "key1=value1,key2=value2".parse().unwrap()); + + let mut egress_headers = http::HeaderMap::new(); + tracing::subscriber::with_default(subscriber, || { + let request_span = tracing::info_span!("request"); + set_span_parent_from_headers(&request_span, &inbound_headers); + let _entered = request_span.enter(); + inject_current_context(&mut egress_headers); + }); + + let egress_tracestate = egress_headers + .get("tracestate") + .expect("egress should carry tracestate") + .to_str() + .unwrap(); + assert!( + egress_tracestate.contains("key1=value1") && egress_tracestate.contains("key2=value2"), + "egress tracestate should preserve both inbound entries, got: {egress_tracestate}" + ); + } +} diff --git a/crates/trace-propagation/tests/no_global_propagator.rs b/crates/trace-propagation/tests/no_global_propagator.rs new file mode 100644 index 0000000000..005555be0a --- /dev/null +++ b/crates/trace-propagation/tests/no_global_propagator.rs @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +//! Verifies the crate's helpers stay safe no-ops when no OpenTelemetry **global text-map +//! propagator** is installed — e.g. a component that uses this crate but never configures +//! tracing, or a process before its startup wiring runs. +//! +//! This is its own test binary (separate process), so — unlike the in-lib unit tests, which call +//! `set_text_map_propagator` — nothing installs a propagator here and the global stays at +//! OpenTelemetry's no-op default. The tests only *read* the global propagator, never set it, so they +//! are safe to run concurrently. +//! +//! **Do not add a test to this file that installs a propagator:** the global is process-wide, so it +//! would leak into the other tests. The [`assert_no_global_propagator`] guard at the top of each test +//! fails fast if that ever happens. + +use opentelemetry::global; +use tower::{Service, ServiceExt}; +use trace_propagation::{ + TraceInjectService, extract_context, inject_context, inject_current_context, + set_span_parent_from_headers, +}; + +const SAMPLE_TRACEPARENT: &str = "00-1111111111111111aaaaaaaaaaaaaaaa-2222222222222222-01"; + +fn headers_with_traceparent() -> http::HeaderMap { + let mut headers = http::HeaderMap::new(); + headers.insert("traceparent", SAMPLE_TRACEPARENT.parse().unwrap()); + headers +} + +/// Precondition for this file: no global text-map propagator is installed in this test binary. The +/// default no-op propagator advertises no fields, so any installed propagator reports at least one. +fn assert_no_global_propagator() { + let has_fields = global::get_text_map_propagator(|p| p.fields().next().is_some()); + assert!( + !has_fields, + "a global text-map propagator is installed in this test binary; this file's tests require none", + ); +} + +#[test] +fn ingress_and_egress_are_noops_without_a_global_propagator() { + assert_no_global_propagator(); + + // Ingress: parenting a span from real inbound headers must not panic, and (no propagator) must + // not adopt a remote parent. + let inbound = headers_with_traceparent(); + set_span_parent_from_headers(&tracing::info_span!("ingress"), &inbound); // must not panic + + // Egress: injecting writes nothing without an active trace / propagator. + let mut outbound = http::HeaderMap::new(); + inject_context(&extract_context(&inbound), &mut outbound); + inject_current_context(&mut outbound); + assert!( + outbound.is_empty(), + "no trace/propagator -> nothing injected" + ); + + // ...and still no propagator, so the no-ops above genuinely ran under that condition throughout. + assert_no_global_propagator(); +} + +#[tokio::test] +async fn trace_inject_service_forwards_unchanged_without_a_global_propagator() { + assert_no_global_propagator(); + + // Wrapping a brand-new client in TraceInjectService and driving a request through it must not + // panic when there's no propagator/active span; it forwards the request untouched. + let inner = tower::service_fn(|req: http::Request<()>| async move { + Ok::<_, std::convert::Infallible>(req) + }); + let mut svc = TraceInjectService::new(inner); + let req = http::Request::builder().body(()).unwrap(); + let forwarded = svc.ready().await.unwrap().call(req).await.unwrap(); + assert!( + !forwarded.headers().contains_key("traceparent"), + "no trace -> no header injected" + ); + + // The egress no-op above can't itself detect an installed propagator (no active span), so + // confirm the precondition still held for the whole test. + assert_no_global_propagator(); +} diff --git a/docs/observability/tracing.md b/docs/observability/tracing.md index 388a7d470a..9364a6c48f 100644 --- a/docs/observability/tracing.md +++ b/docs/observability/tracing.md @@ -28,6 +28,10 @@ costs. ``` Leaving the OTLP endpoint configured while tracing is disabled costs almost nothing. - Transport is **OTLP/gRPC, plaintext**; nico-api cannot do OTLP/HTTP or originate TLS +- nico-api **propagates W3C trace context** at its network boundaries: it reads `traceparent`/ + `tracestate` from inbound REST and gRPC requests and continues that trace, and injects the same + headers into its outbound requests. Propagation links traces across services but does not by itself + enable recording (see [1.6](#16-w3c-trace-context-propagation)). --- @@ -71,15 +75,17 @@ backends, plus the database work underneath them - which maps directly to the EP ### 1.3 How spans are selected (sampler) -nico-api uses a custom `CarbideSpanSampler` wrapped as **`ParentBased`**: +nico-api uses a custom `CarbideSpanSampler`: - A **root span** is recorded only if both are true: - the in-process `tracing_enabled` flag is on, from `[tracing] enabled = true` at startup or from the dynamic `tracing-enabled` setting - - the span's `code.namespace` begins with `carbide::` -- **Child spans inherit the root's decision** (that's what `ParentBased` means), so once a - trace is sampled the whole call tree beneath it is captured - **except tokio spans, which are - always dropped** (they leak and would exhaust memory). + - the span carries the `carbide.trace_root` marker attribute, set explicitly on the request span and a few + deliberate roots (the state-controller reconcile loops, site-explorer) +- **In-process child spans inherit the root's decision**, so once a trace is sampled the whole call tree beneath + it is captured - **except tokio spans, which are always dropped** (they leak and would exhaust memory). +- For a span parented to a **remote** (ingress-extracted) trace, the decision stays local: an inbound `sampled` + flag does not override `tracing_enabled` (see §1.6). - The exporter resource is `service.name = carbide-api`; the tracer is named `carbide`. ### 1.4 How traces leave nico-api @@ -106,6 +112,66 @@ nico-api's: - **Resource / output:** `service.name = carbide-dns`; logs are JSON on stdout (not logfmt). - **Same transport constraints:** OTLP/gRPC, plaintext (`with_tonic`, no `tls` feature) +### 1.6 W3C trace-context propagation + +nico-api accepts and produces **W3C Trace Context** headers (`traceparent`, `tracestate`) at its +network boundaries, so a request already traced by another service stays one trace as it passes +through nico-api. The standard `TraceContextPropagator` is installed once at startup +(`crates/api-core/src/logging/setup.rs`); there is no custom header parsing. + +- **Ingress (REST + gRPC).** The shared per-request layer (`crates/api-core/src/logging/api_logs.rs`) + extracts any inbound `traceparent`/`tracestate` and makes the upstream span the parent of nico-api's + request span. REST and gRPC flow through this single layer, so both are covered. A missing or + malformed `traceparent` leaves the request span a fresh root. +- **Egress.** When nico-api makes an outbound call from within a traced request, it injects the + current `traceparent`/`tracestate` so the downstream service can continue the trace. Covered: + - **gRPC** - Forge and NMX-C (`crates/rpc`), the NSM and power-shelf (PSM) backends + (`crates/component-manager`), and the NMX-C client pool (`crates/libnmxc`), through a shared tower + layer applied to every request. + - **HTTP** - the BMC/Redfish handler, machine-identity token exchange, admin-UI OAuth2, NMX-M, NRAS, + the MQTT OAuth2 token provider, and firmware downloads. +- **Interaction with the enable flag.** `tracing-enabled` stays the master switch for what nico-api + *records*: an inbound `sampled` flag never turns recording on here. When it is on, the inbound + `trace_id` is inherited, so nico-api's spans join the caller's trace. +- **Forwarding vs. recording.** Forwarding the context is separate from recording it, but today both + depend on the exporter being built: + - *Exporter built, tracing off:* records nothing, yet still forwards the inbound `trace_id`/`tracestate` + marked **not sampled** (`sampled=0`). + - *No endpoint configured (exporter not built):* does **not** forward at all, so the trace **breaks** at + this hop. **Known limitation.** +- **Scope.** Trace context only (`traceparent`/`tracestate`). + +### 1.7 Adding a new network client + +Propagation is automatic on ingress but opt-in on egress, so keep this in mind when adding code: + +- **New ingress (a REST route or gRPC method): nothing to do.** Every inbound request flows through the + shared per-request layer (`crates/api-core/src/logging/api_logs.rs`), which extracts the inbound context + for you. +- **New outbound gRPC client (tonic/hyper):** wrap its channel/service with + `trace_propagation::TraceInjectService` at construction — better, build through an existing shared + client that already wraps the transport (see `crates/rpc/src/forge_tls_client.rs`). +- **New outbound HTTP client (`reqwest`):** build it through the `reqwest-tracing` middleware instead of using a + bare `reqwest::Client`. The wrapped client injects the current `traceparent`/`tracestate` into every request + automatically, so there is no per-call code: + + ```rust + let client = reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build(); // -> reqwest_middleware::ClientWithMiddleware, a drop-in for request-building + let resp = client.get(url).send().await?; + ``` + + See `crates/nras/src/client.rs` for a real example. +- **When another crate owns the HTTP call (manual fallback):** if the request is built and sent by code you + don't control — e.g. the `oauth2` client (`crates/api-web/src/auth.rs`), which owns its own `reqwest` + request — inject into that request's headers directly: + + ```rust + trace_propagation::inject_current_context(request.headers_mut()); + ``` +Injection is always a safe no-op when no trace is active, so it is safe to add unconditionally. + --- ## 2. How to enable and disable tracing @@ -321,7 +387,7 @@ which of three states nico-api is in: This is the expensive mode the dev team warns about: -- Because the sampler is `ParentBased`, a sampled root span pulls in its **entire child subtree** +- Because a span's in-process children inherit its sampling decision, a sampled root span pulls in its **entire child subtree** (the component-manager, controller, and DB spans beneath it). A single traced operation can therefore produce many spans. - Costs land in several places: extra **CPU and memory** on nico-api, added **latency** on @@ -377,7 +443,7 @@ annotation involved for traces | Sidecar injected but still no traces | Endpoint not set, or points somewhere other than `localhost:4317` | Set `[tracing] otlp_endpoint = "http://localhost:4317"` or `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://localhost:4317` on nico-api | | Traces reach the collector but not the backend | Collector exporter endpoint/TLS wrong | Check the exporter config; for remote backends configure TLS/mTLS on the collector | | Sudden resource/latency spike on nico-api | Tracing left on | `nico-admin-cli set tracing-enabled false`, or set `[tracing] enabled = false` and roll nico-api if runtime changes are disabled | -| Spans arrive but request trees look sparse | The `carbide::` root-span nuance | Verify where the root span originates on a live environment | +| Spans arrive but request trees look sparse | Only `carbide.trace_root`-marked spans start a recorded trace (§1.3) | Confirm the operation starts at a marked root span | ---