Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 3 additions & 17 deletions crates/scout/src/deprovision/scrabbing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,12 @@ use carbide_uuid::machine::MachineId;
use regex::Regex;
use scout::CarbideClientError;
use serde::Deserialize;
use smbioslib::SMBiosSystemInformation;
use tracing::Instrument;

use crate::cfg::Options;
use crate::client::create_forge_client;
use crate::deprovision::cmdrun;
use crate::{CarbideClientResult, IN_QEMU_VM};
use crate::{CarbideClientResult, IN_QEMU_VM, platform};

fn check_memory_overwrite_efi_var() -> Result<(), CarbideClientError> {
let name = match efivar::efi::Variable::from_str(
Expand Down Expand Up @@ -1093,22 +1092,9 @@ async fn do_cleanup(machine_id: &MachineId) -> CarbideClientResult<rpc::MachineC
Ok(cleanup_result)
}

/// Reports whether this machine is a managed host rather than a BlueField DPU.
///
/// The product name of every SMBIOS System Information structure is compared,
/// case-insensitively, against `"bluefield"`. The machine counts as a host
/// unless one of them matches. When the SMBIOS table cannot be loaded, or it
/// carries no System Information structure, the machine is treated as a host.
fn is_host() -> bool {
    match smbioslib::table_load_from_device() {
        // The negation sits outside `any` on purpose: an empty table then
        // yields "host", matching the load-failure default below, and a single
        // BlueField entry is enough to classify the machine as a DPU.
        Ok(data) => !data.any(|sys_info: SMBiosSystemInformation| {
            sys_info
                .product_name()
                .to_string()
                .to_lowercase()
                .contains("bluefield")
        }),
        // No SMBIOS data means no evidence of a DPU; default to host.
        Err(_err) => true,
    }
}

pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClientResult<()> {
tracing::info!("full deprovision starts.");
if !is_host() {
if !platform::is_host() {
tracing::info!("full deprovision skipped, we are not running on a host.");
// do not send API cleanup_machine_completed
return Ok(());
Expand All @@ -1122,7 +1108,7 @@ pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClie
}

pub async fn run_no_api(tpm_path: &str) -> Result<(), CarbideClientError> {
if !is_host() {
if !platform::is_host() {
tracing::info!("No cleanup needed on DPU.");
return Ok(());
}
Expand Down
1 change: 1 addition & 0 deletions crates/scout/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ mod discovery;
mod firmware_upgrade;
mod machine_validation;
mod mlx_device;
mod platform;
mod register;
mod stream;
mod tpm;
Expand Down
42 changes: 42 additions & 0 deletions crates/scout/src/platform.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use smbioslib::{SMBiosSystemInformation, table_load_from_device};

/// Returns `true` when scout is running on a managed host (as opposed to a DPU).
///
/// The product name of every SMBIOS System Information structure is compared,
/// case-insensitively, against `"bluefield"`. The machine counts as a host
/// unless one of them matches. When the SMBIOS table cannot be loaded, or it
/// carries no System Information structure, the machine is treated as a host.
pub(crate) fn is_host() -> bool {
    match table_load_from_device() {
        // The negation sits outside `any` on purpose: an empty table then
        // yields "host", matching the load-failure default below, and a single
        // BlueField entry is enough to classify the machine as a DPU.
        Ok(data) => !data.any(|sys_info: SMBiosSystemInformation| {
            sys_info
                .product_name()
                .to_string()
                .to_lowercase()
                .contains("bluefield")
        }),
        // No SMBIOS data means no evidence of a DPU; default to host.
        Err(_err) => true,
    }
}

#[cfg(test)]
mod tests {
    use super::is_host;

    /// Smoke test: probing the platform must complete without panicking,
    /// whatever answer it gives on the machine running the tests.
    #[test]
    fn is_host_returns_bool_without_panicking() {
        let _detected: bool = is_host();
    }
}
32 changes: 25 additions & 7 deletions crates/scout/src/register.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use tracing::info;
use tss_esapi::Context;
use tss_esapi::handles::KeyHandle;

use crate::{CarbideClientError, attestation as attest};
use crate::{CarbideClientError, attestation as attest, platform, tpm};

pub async fn run(
forge_api: &str,
Expand All @@ -36,7 +36,9 @@ pub async fn run(
let mut hardware_info = enumerate_hardware()?;
info!("Successfully enumerated hardware");

let is_dpu = hardware_info.tpm_ek_certificate.is_none();
// Missing TPM EK material must not be treated as DPU detection. DPUs are
// identified from platform SMBIOS data, not from TPM availability.
let is_dpu = !platform::is_host();

if machine_interface_id.is_none() && !is_dpu {
return Err(CarbideClientError::GenericError(
Expand All @@ -55,15 +57,31 @@ pub async fn run(
crate::tpm::set_tpm_max_auth_fail()?;

// create tss context
let mut tss_ctx = attest::create_context_from_path(tpm_path)
.map_err(|e| CarbideClientError::TpmError(format!("Could not create context: {e}")))?;
let mut tss_ctx = match attest::create_context_from_path(tpm_path) {
Ok(ctx) => ctx,
Err(e) => {
let err = CarbideClientError::TpmError(format!("Could not create context: {e}"));
if tpm::is_recoverable_tpm_client_error(&err) {
tpm::recover_tpm_and_reboot(tpm_path)?;
}
return Err(err);
}
};

// CHANGETO - supply context externally
hardware_info.tpm_description = attest::get_tpm_description(&mut tss_ctx);

let result = attest::create_attest_key_info(&mut tss_ctx).map_err(|e| {
CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}"))
})?;
let result = match attest::create_attest_key_info(&mut tss_ctx) {
Ok(result) => result,
Err(e) => {
let err =
CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}"));
if tpm::is_recoverable_tpm_client_error(&err) {
tpm::recover_tpm_and_reboot(tpm_path)?;
}
return Err(err);
}
};

hardware_info.attest_key_info = Some(result.0);
endorsement_key_handle_opt = Some(result.1);
Expand Down
65 changes: 65 additions & 0 deletions crates/scout/src/tpm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,18 @@
* limitations under the License.
*/

use std::fs::File;
use std::io::Write;
use std::path::Path;
use std::process::Command;

use tss_esapi::handles::AuthHandle;
use tss_esapi::interface_types::session_handles::AuthSession;

use crate::{CarbideClientError, attestation as attest};

/// Marker file that [`recover_tpm_and_reboot`] creates after clearing the TPM
/// and checks before starting a recovery; a recovery is refused while the file
/// exists.
// NOTE(review): presumably `/tmp` is wiped on reboot, which is what would scope
// the marker to a single boot cycle — confirm for the scout image.
pub(crate) const TPM_RECOVERY_ATTEMPTED_PATH: &str = "/tmp/tpm_recovery_reboot_attempted";

// From https://superuser.com/questions/1404738/tpm-2-0-hardware-error-da-lockout-mode
pub(crate) fn set_tpm_max_auth_fail() -> Result<(), CarbideClientError> {
let output = Command::new("tpm2_dictionarylockout")
Expand Down Expand Up @@ -81,3 +86,63 @@ pub(crate) fn clear_tpm(tpm_path: &str) -> Result<(), CarbideClientError> {
tracing::info!("TPM lockout hierarchy clear completed");
Ok(())
}

pub(crate) fn is_recoverable_tpm_client_error(error: &CarbideClientError) -> bool {
match error {
CarbideClientError::TpmError(message) => {
message.contains("Could not create AttestKeyInfo")
|| message.contains("Could not create context")
|| message.contains("TPM2_Clear")
}
_ => false,
}
}

/// Performs a guarded TPM recovery: clear the TPM, write a marker file, then reboot.
///
/// The marker at `TPM_RECOVERY_ATTEMPTED_PATH` prevents the recovery from being
/// repeated: when it already exists the function refuses to run again.
///
/// # Errors
///
/// * `CarbideClientError::TpmError` when the marker file already exists.
/// * The error returned by `clear_tpm` when clearing the TPM fails.
/// * `CarbideClientError::StdIo` when the marker cannot be created or written,
///   or when `systemctl` cannot be executed.
/// * `CarbideClientError::GenericError` when `systemctl reboot` exits unsuccessfully.
pub(crate) fn recover_tpm_and_reboot(tpm_path: &str) -> Result<(), CarbideClientError> {
    let recovery_marker = Path::new(TPM_RECOVERY_ATTEMPTED_PATH);
    if recovery_marker.exists() {
        let refusal = "TPM recovery was already attempted this boot cycle; refusing to loop";
        return Err(CarbideClientError::TpmError(refusal.to_string()));
    }

    tracing::warn!("Attempting automated TPM clear and reboot to recover attestation state");
    clear_tpm(tpm_path)?;

    // NOTE(review): the marker is written only after the clear succeeded, so a
    // failed clear is not recorded as an attempt — confirm this is intended.
    let mut marker_file = File::create(recovery_marker).map_err(CarbideClientError::StdIo)?;
    marker_file
        .write_all(b"tpm recovery reboot requested\n")
        .map_err(CarbideClientError::StdIo)?;

    let reboot = Command::new("systemctl")
        .arg("reboot")
        .output()
        .map_err(CarbideClientError::StdIo)?;
    if reboot.status.success() {
        return Ok(());
    }

    Err(CarbideClientError::GenericError(format!(
        "systemctl reboot failed with status {:?}: {}",
        reboot.status.code(),
        String::from_utf8_lossy(&reboot.stderr)
    )))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `TpmError` carrying `message`.
    fn tpm_error(message: &str) -> CarbideClientError {
        CarbideClientError::TpmError(message.to_string())
    }

    #[test]
    fn recoverable_tpm_errors_include_attest_key_info_failures() {
        let err = tpm_error("Could not create AttestKeyInfo: test");
        assert!(is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn recoverable_tpm_errors_include_context_creation_failures() {
        let err = tpm_error("Could not create context: test");
        assert!(is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn recoverable_tpm_errors_include_tpm2_clear_failures() {
        let err = tpm_error("TPM2_Clear failed: test");
        assert!(is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn unrelated_tpm_errors_are_not_recoverable() {
        let err = tpm_error("Could not read PCR bank: test");
        assert!(!is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn non_tpm_client_errors_are_not_recoverable() {
        let err = CarbideClientError::GenericError("transport failed".to_string());
        assert!(!is_recoverable_tpm_client_error(&err));
    }

    #[test]
    fn non_tpm_errors_with_recoverable_text_are_not_recoverable() {
        // The variant decides first; a matching message alone is not enough.
        let err = CarbideClientError::GenericError("Could not create context".to_string());
        assert!(!is_recoverable_tpm_client_error(&err));
    }
}