diff --git a/crates/scout/src/deprovision/scrabbing.rs b/crates/scout/src/deprovision/scrabbing.rs index c9a482dc45..e38261a10d 100644 --- a/crates/scout/src/deprovision/scrabbing.rs +++ b/crates/scout/src/deprovision/scrabbing.rs @@ -23,13 +23,12 @@ use carbide_uuid::machine::MachineId; use regex::Regex; use scout::CarbideClientError; use serde::Deserialize; -use smbioslib::SMBiosSystemInformation; use tracing::Instrument; use crate::cfg::Options; use crate::client::create_forge_client; use crate::deprovision::cmdrun; -use crate::{CarbideClientResult, IN_QEMU_VM}; +use crate::{CarbideClientResult, IN_QEMU_VM, platform}; fn check_memory_overwrite_efi_var() -> Result<(), CarbideClientError> { let name = match efivar::efi::Variable::from_str( @@ -1093,22 +1092,9 @@ async fn do_cleanup(machine_id: &MachineId) -> CarbideClientResult bool { - match smbioslib::table_load_from_device() { - Ok(data) => data.any(|sys_info: SMBiosSystemInformation| { - !sys_info - .product_name() - .to_string() - .to_lowercase() - .contains("bluefield") - }), - Err(_err) => true, - } -} - pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClientResult<()> { tracing::info!("full deprovision starts."); - if !is_host() { + if !platform::is_host() { tracing::info!("full deprovision skipped, we are not running on a host."); // do not send API cleanup_machine_completed return Ok(()); @@ -1122,7 +1108,7 @@ pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClie } pub async fn run_no_api(tpm_path: &str) -> Result<(), CarbideClientError> { - if !is_host() { + if !platform::is_host() { tracing::info!("No cleanup needed on DPU."); return Ok(()); } diff --git a/crates/scout/src/main.rs b/crates/scout/src/main.rs index 695b2563d5..10c334e89e 100644 --- a/crates/scout/src/main.rs +++ b/crates/scout/src/main.rs @@ -55,6 +55,7 @@ mod discovery; mod firmware_upgrade; mod machine_validation; mod mlx_device; +mod platform; mod register; mod stream; mod tpm; diff --git a/crates/scout/src/platform.rs b/crates/scout/src/platform.rs new file mode 100644 index 0000000000..67eec980b9 --- /dev/null +++ b/crates/scout/src/platform.rs @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use smbioslib::{SMBiosSystemInformation, table_load_from_device}; + +/// Returns `true` when scout is running on a managed host (as opposed to a DPU). +pub(crate) fn is_host() -> bool { + match table_load_from_device() { + Ok(data) => data.any(|sys_info: SMBiosSystemInformation| { + !sys_info + .product_name() + .to_string() + .to_lowercase() + .contains("bluefield") + }), + Err(_err) => true, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn is_host_returns_bool_without_panicking() { + let _ = is_host(); + } +} diff --git a/crates/scout/src/register.rs b/crates/scout/src/register.rs index e02f23b91b..257e0308b6 100644 --- a/crates/scout/src/register.rs +++ b/crates/scout/src/register.rs @@ -24,7 +24,7 @@ use tracing::info; use tss_esapi::Context; use tss_esapi::handles::KeyHandle; -use crate::{CarbideClientError, attestation as attest}; +use crate::{CarbideClientError, attestation as attest, platform, tpm}; pub async fn run( forge_api: &str, @@ -36,7 +36,9 @@ pub async fn run( let mut hardware_info = enumerate_hardware()?; info!("Successfully enumerated hardware"); - let is_dpu = hardware_info.tpm_ek_certificate.is_none(); + // Missing TPM EK material must not be treated as DPU detection. DPUs are + // identified from platform SMBIOS data, not from TPM availability. + let is_dpu = !platform::is_host(); if machine_interface_id.is_none() && !is_dpu { return Err(CarbideClientError::GenericError( @@ -55,15 +57,31 @@ pub async fn run( crate::tpm::set_tpm_max_auth_fail()?; // create tss context - let mut tss_ctx = attest::create_context_from_path(tpm_path) - .map_err(|e| CarbideClientError::TpmError(format!("Could not create context: {e}")))?; + let mut tss_ctx = match attest::create_context_from_path(tpm_path) { + Ok(ctx) => ctx, + Err(e) => { + let err = CarbideClientError::TpmError(format!("Could not create context: {e}")); + if tpm::is_recoverable_tpm_client_error(&err) { + tpm::recover_tpm_and_reboot(tpm_path)?; + } + return Err(err); + } + }; // CHANGETO - supply context externally hardware_info.tpm_description = attest::get_tpm_description(&mut tss_ctx); - let result = attest::create_attest_key_info(&mut tss_ctx).map_err(|e| { - CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}")) - })?; + let result = match attest::create_attest_key_info(&mut tss_ctx) { + Ok(result) => result, + Err(e) => { + let err = + CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}")); + if tpm::is_recoverable_tpm_client_error(&err) { + tpm::recover_tpm_and_reboot(tpm_path)?; + } + return Err(err); + } + }; hardware_info.attest_key_info = Some(result.0); endorsement_key_handle_opt = Some(result.1); diff --git a/crates/scout/src/tpm.rs b/crates/scout/src/tpm.rs index 46ba4c49d1..b489b8f109 100644 --- a/crates/scout/src/tpm.rs +++ b/crates/scout/src/tpm.rs @@ -15,6 +15,9 @@ * limitations under the License. */ +use std::fs::File; +use std::io::Write; +use std::path::Path; use std::process::Command; use tss_esapi::handles::AuthHandle; @@ -22,6 +25,8 @@ use tss_esapi::interface_types::session_handles::AuthSession; use crate::{CarbideClientError, attestation as attest}; +pub(crate) const TPM_RECOVERY_ATTEMPTED_PATH: &str = "/tmp/tpm_recovery_reboot_attempted"; + // From https://superuser.com/questions/1404738/tpm-2-0-hardware-error-da-lockout-mode pub(crate) fn set_tpm_max_auth_fail() -> Result<(), CarbideClientError> { let output = Command::new("tpm2_dictionarylockout") @@ -81,3 +86,63 @@ pub(crate) fn clear_tpm(tpm_path: &str) -> Result<(), CarbideClientError> { tracing::info!("TPM lockout hierarchy clear completed"); Ok(()) } + +pub(crate) fn is_recoverable_tpm_client_error(error: &CarbideClientError) -> bool { + match error { + CarbideClientError::TpmError(message) => { + message.contains("Could not create AttestKeyInfo") + || message.contains("Could not create context") + || message.contains("TPM2_Clear") + } + _ => false, + } +} + +/// Clears the TPM and reboots the host once per boot cycle to recover from missing TPM material. +pub(crate) fn recover_tpm_and_reboot(tpm_path: &str) -> Result<(), CarbideClientError> { + if Path::new(TPM_RECOVERY_ATTEMPTED_PATH).exists() { + return Err(CarbideClientError::TpmError( + "TPM recovery was already attempted this boot cycle; refusing to loop".to_string(), + )); + } + + tracing::warn!("Attempting automated TPM clear and reboot to recover attestation state"); + clear_tpm(tpm_path)?; + + let mut marker = + File::create(TPM_RECOVERY_ATTEMPTED_PATH).map_err(CarbideClientError::StdIo)?; + marker + .write_all(b"tpm recovery reboot requested\n") + .map_err(CarbideClientError::StdIo)?; + + let output = Command::new("systemctl") + .arg("reboot") + .output() + .map_err(CarbideClientError::StdIo)?; + if !output.status.success() { + return Err(CarbideClientError::GenericError(format!( + "systemctl reboot failed with status {:?}: {}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ))); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn recoverable_tpm_errors_include_attest_key_info_failures() { + let err = CarbideClientError::TpmError("Could not create AttestKeyInfo: test".to_string()); + assert!(is_recoverable_tpm_client_error(&err)); + } + + #[test] + fn non_tpm_client_errors_are_not_recoverable() { + let err = CarbideClientError::GenericError("transport failed".to_string()); + assert!(!is_recoverable_tpm_client_error(&err)); + } +}