From 4086978d25864ed67d446ba453ae286658988d27 Mon Sep 17 00:00:00 2001 From: Joshua Gilman Date: Sat, 2 May 2026 22:07:44 -0700 Subject: [PATCH] feat(labctl): harden Talos image build reliability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make `labctl bootstrap talos image build` resilient enough for durable clusters, not just disposable bootstrap. Mirrors the IncusOS path's SHA256 verification while adding the protections it lacks. - Atomic downloads: stream archive into a sibling .tmp-* file via os.CreateTemp, hash-on-stream against the published Image Factory sidecar (`.sha256`), and os.Rename only when the digest matches. A corrupt cached archive is detected on every run and redownloaded. Same atomic pattern is now used for the decompressed boot image. - Bound xz decompression to MaxBootImageBytes (4 GiB) using the same io.LimitReader pattern incusosimage already uses. - HTTP client gets explicit ResponseHeaderTimeout (30s) and IdleConnTimeout (90s) via httpupstream.NewHTTPClient instead of http.DefaultClient. No top-level Timeout — long downloads use context cancellation. Both Talos and IncusOS paths benefit. - Retry with exponential backoff (5 attempts, 500ms..16s, ±20% jitter) on connection errors, 5xx, and 429. 4xx and ctx errors are permanent. - Result now carries BootArtifactSHA256 and ConfigArtifactSHA256; --json output exposes them so operators can verify before flashing. Tests cover atomic-write commit, redownload-on-corrupt-cache, sidecar-mismatch rejection (with no leaked .tmp- file), retry on 5xx, no-retry on 4xx, and SHA256 sidecar parsing edge cases. testscript fixture now serves the matching .sha256 sidecar and asserts the new digest fields. Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/labctl/cmd/labctl/main_test.go | 8 + .../testdata/script/talos_image_build.txtar | 2 + tools/labctl/go.mod | 2 +- .../internal/adapters/httpupstream/client.go | 161 +++++++++++++-- .../adapters/httpupstream/client_test.go | 133 +++++++++++++ .../internal/adapters/localfs/filesystem.go | 17 ++ .../labctl/internal/app/talosimage/service.go | 188 +++++++++++++++--- .../internal/app/talosimage/service_test.go | 155 ++++++++++++++- tools/labctl/internal/app/talosimage/types.go | 29 ++- tools/labctl/internal/cli/bootstrap.go | 40 ++-- .../internal/composition/composition.go | 2 +- 11 files changed, 667 insertions(+), 70 deletions(-) create mode 100644 tools/labctl/internal/adapters/httpupstream/client_test.go diff --git a/tools/labctl/cmd/labctl/main_test.go b/tools/labctl/cmd/labctl/main_test.go index 14b31c1..453b333 100644 --- a/tools/labctl/cmd/labctl/main_test.go +++ b/tools/labctl/cmd/labctl/main_test.go @@ -2,6 +2,8 @@ package main import ( "bytes" + "crypto/sha256" + "encoding/hex" "fmt" "net/http" "net/http/httptest" @@ -37,12 +39,18 @@ func setupTestScript(env *testscript.Env) error { if err != nil { return err } + talosArchiveSHA256 := sha256.Sum256(talosArchive) + talosArchiveDigest := hex.EncodeToString(talosArchiveSHA256[:]) + talosSidecar := []byte(talosArchiveDigest + " nocloud-amd64.raw.xz\n") + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case "/repos/GilmanLab/secrets/contents/network/keycloak.sops.yaml": handleSecretFixture(env, w, r) case "/image/" + defaultTalosSchematicID + "/v1.13.0/nocloud-amd64.raw.xz": _, _ = w.Write(talosArchive) + case "/image/" + defaultTalosSchematicID + "/v1.13.0/nocloud-amd64.raw.xz.sha256": + _, _ = w.Write(talosSidecar) default: http.Error(w, fmt.Sprintf("unexpected path %s", r.URL.Path), http.StatusNotFound) } diff --git a/tools/labctl/cmd/labctl/testdata/script/talos_image_build.txtar b/tools/labctl/cmd/labctl/testdata/script/talos_image_build.txtar index 4e4c17a..651a534 100644 --- a/tools/labctl/cmd/labctl/testdata/script/talos_image_build.txtar +++ b/tools/labctl/cmd/labctl/testdata/script/talos_image_build.txtar @@ -31,7 +31,9 @@ exists .state/downloads/talos/376567988ad370138ad8b2698212367b8edcb69b5fd68c80be exec labctl bootstrap talos image build --json talos-valid.yaml stdout '"name":"talos-test"' stdout '"bootArtifactPath":' +stdout '"bootArtifactSHA256":"[0-9a-f]{64}"' stdout '"configArtifactPath":' +stdout '"configArtifactSHA256":"[0-9a-f]{64}"' stdout '"sourceVersion":"v1.13.0"' stdout '"sourceURL":' stdout 'nocloud-amd64.raw.xz' diff --git a/tools/labctl/go.mod b/tools/labctl/go.mod index 6febd1c..3b7395f 100644 --- a/tools/labctl/go.mod +++ b/tools/labctl/go.mod @@ -9,6 +9,7 @@ require ( github.com/aws/aws-sdk-go-v2 v1.41.7 github.com/aws/aws-sdk-go-v2/config v1.32.17 github.com/aws/aws-sdk-go-v2/service/lambda v1.90.1 + github.com/cenkalti/backoff/v4 v4.3.0 github.com/diskfs/go-diskfs v1.9.1 github.com/getsops/sops/v3 v3.12.2 github.com/gilmanlab/platform/schemas/lab v0.2.1 @@ -113,7 +114,6 @@ require ( github.com/butuzov/mirror v1.3.0 // indirect github.com/catenacyber/perfsprint v0.10.1 // indirect github.com/ccojocar/zxcvbn-go v1.0.4 // indirect - github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/charithe/durationcheck v0.0.11 // indirect github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc // indirect diff --git a/tools/labctl/internal/adapters/httpupstream/client.go b/tools/labctl/internal/adapters/httpupstream/client.go index b058ec4..cfb7bad 100644 --- a/tools/labctl/internal/adapters/httpupstream/client.go +++ b/tools/labctl/internal/adapters/httpupstream/client.go @@ -1,24 +1,63 @@ package httpupstream import ( + "bufio" "context" "encoding/json" + "errors" "fmt" "io" "net/http" + "strings" + "time" + + "github.com/cenkalti/backoff/v4" "github.com/gilmanlab/platform/tools/labctl/internal/app/incusosimage" ) -// Client fetches IncusOS upstream metadata and artifacts over HTTP. +const ( + // DefaultResponseHeaderTimeout bounds how long the client will wait for + // upstream response headers before failing the connection. + DefaultResponseHeaderTimeout = 30 * time.Second + // DefaultIdleConnTimeout bounds how long an idle keep-alive connection + // will be held open in the transport's idle pool. + DefaultIdleConnTimeout = 90 * time.Second + + retryMaxAttempts = 5 + retryInitialInterval = 500 * time.Millisecond + retryMaxInterval = 16 * time.Second + retryRandomization = 0.2 + + sha256HexLen = 64 +) + +// NewHTTPClient constructs a [*http.Client] with explicit transport timeouts +// suited to upstream artifact downloads. +// +// The returned client has no top-level Timeout — long-running downloads use +// context cancellation. ResponseHeaderTimeout prevents stalled connections +// from hanging forever waiting for response headers. +func NewHTTPClient() *http.Client { + return &http.Client{ + Transport: &http.Transport{ + Proxy: http.ProxyFromEnvironment, + ResponseHeaderTimeout: DefaultResponseHeaderTimeout, + IdleConnTimeout: DefaultIdleConnTimeout, + }, + } +} + +// Client fetches IncusOS and Talos upstream metadata and artifacts over HTTP. type Client struct { httpClient *http.Client } -// New constructs an HTTP upstream adapter. +// New constructs an HTTP upstream adapter. A nil httpClient selects the +// timeout-defaulted client returned by NewHTTPClient. func New(httpClient *http.Client) Client { if httpClient == nil { - httpClient = http.DefaultClient + httpClient = NewHTTPClient() } return Client{ @@ -43,6 +82,11 @@ func (c Client) FetchIndex(ctx context.Context, url string) (incusosimage.Index, } // Download opens a response body for an upstream artifact. +// +// The returned ReadCloser must be closed by the caller. Body reads are not +// retried; transient mid-stream failures are surfaced as errors and the +// caller is expected to verify integrity (for example, by streaming through +// a SHA256 hasher and comparing against a sidecar fetched via FetchSHA256). func (c Client) Download(ctx context.Context, url string) (io.ReadCloser, error) { response, err := c.get(ctx, url) if err != nil { @@ -52,21 +96,114 @@ func (c Client) Download(ctx context.Context, url string) (io.ReadCloser, error) return response.Body, nil } -func (c Client) get(ctx context.Context, url string) (*http.Response, error) { - request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) +// FetchSHA256 fetches a checksum sidecar and returns the lowercase hex digest. +// +// The response body is expected to be in `sha256sum -c` format: a single +// line of " ". Only the first whitespace-delimited token is +// parsed; the filename and any trailing lines are ignored. +func (c Client) FetchSHA256(ctx context.Context, url string) (string, error) { + response, err := c.get(ctx, url) if err != nil { - return nil, fmt.Errorf("create request %q: %w", url, err) + return "", err + } + defer response.Body.Close() + + scanner := bufio.NewScanner(response.Body) + if !scanner.Scan() { + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("read sha256 sidecar %q: %w", url, err) + } + + return "", fmt.Errorf("sha256 sidecar %q is empty", url) } - response, err := c.httpClient.Do(request) - if err != nil { - return nil, fmt.Errorf("GET %q: %w", url, err) + fields := strings.Fields(scanner.Text()) + if len(fields) == 0 { + return "", fmt.Errorf("sha256 sidecar %q has no digest", url) } - if response.StatusCode < http.StatusOK || response.StatusCode >= http.StatusMultipleChoices { - defer response.Body.Close() - return nil, fmt.Errorf("GET %q: unexpected HTTP status %s", url, response.Status) + digest := strings.ToLower(fields[0]) + if !isHexSHA256(digest) { + return "", fmt.Errorf("sha256 sidecar %q digest %q is not a 64-character hex string", url, digest) + } + + return digest, nil +} + +func (c Client) get(ctx context.Context, url string) (*http.Response, error) { + var response *http.Response + + operation := func() error { + request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return backoff.Permanent(fmt.Errorf("create request %q: %w", url, err)) + } + + resp, err := c.httpClient.Do(request) + if err != nil { + return err + } + + if isRetryableStatus(resp.StatusCode) { + status := resp.Status + _ = resp.Body.Close() + + return fmt.Errorf("GET %q: retryable HTTP status %s", url, status) + } + if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices { + status := resp.Status + _ = resp.Body.Close() + + return backoff.Permanent(fmt.Errorf("GET %q: unexpected HTTP status %s", url, status)) + } + + response = resp + + return nil + } + + policy := backoff.WithContext(newRetryBackoff(), ctx) + if err := backoff.Retry(operation, policy); err != nil { + if permanent, ok := errors.AsType[*backoff.PermanentError](err); ok { + return nil, permanent.Err + } + + return nil, err } return response, nil } + +func newRetryBackoff() backoff.BackOff { + expo := backoff.NewExponentialBackOff() + expo.InitialInterval = retryInitialInterval + expo.MaxInterval = retryMaxInterval + expo.RandomizationFactor = retryRandomization + expo.MaxElapsedTime = 0 + + return backoff.WithMaxRetries(expo, retryMaxAttempts-1) +} + +func isRetryableStatus(code int) bool { + if code == http.StatusTooManyRequests { + return true + } + + return code >= http.StatusInternalServerError && code <= 599 +} + +func isHexSHA256(s string) bool { + if len(s) != sha256HexLen { + return false + } + for _, r := range s { + switch { + case r >= '0' && r <= '9': + case r >= 'a' && r <= 'f': + default: + return false + } + } + + return true +} diff --git a/tools/labctl/internal/adapters/httpupstream/client_test.go b/tools/labctl/internal/adapters/httpupstream/client_test.go new file mode 100644 index 0000000..a6d164e --- /dev/null +++ b/tools/labctl/internal/adapters/httpupstream/client_test.go @@ -0,0 +1,133 @@ +package httpupstream_test + +import ( + "context" + "errors" + "io" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/gilmanlab/platform/tools/labctl/internal/adapters/httpupstream" +) + +func TestClientFetchSHA256ParsesChecksumLine(t *testing.T) { + const digest = "5fa3a23e3f12cf6f33b66e2eb1cd0f8df57f53efb15c1ab8c8f6bb3fa1e02b9d" + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = io.WriteString(w, digest+" nocloud-amd64.raw.xz\n") + })) + t.Cleanup(server.Close) + + client := httpupstream.New(server.Client()) + got, err := client.FetchSHA256(context.Background(), server.URL+"/x.sha256") + + require.NoError(t, err) + assert.Equal(t, digest, got) +} + +func TestClientFetchSHA256RejectsMalformedDigest(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = io.WriteString(w, "not-a-digest\n") + })) + t.Cleanup(server.Close) + + client := httpupstream.New(server.Client()) + _, err := client.FetchSHA256(context.Background(), server.URL+"/x.sha256") + + require.Error(t, err) + assert.Contains(t, err.Error(), "not a 64-character hex string") +} + +func TestClientFetchSHA256RejectsEmptyBody(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + t.Cleanup(server.Close) + + client := httpupstream.New(server.Client()) + _, err := client.FetchSHA256(context.Background(), server.URL+"/x.sha256") + + require.Error(t, err) + assert.Contains(t, err.Error(), "empty") +} + +func TestClientRetriesTransientFailures(t *testing.T) { + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + if calls.Add(1) <= 2 { + http.Error(w, "boom", http.StatusBadGateway) + + return + } + _, _ = io.WriteString(w, "ok") + })) + t.Cleanup(server.Close) + + client := httpupstream.New(server.Client()) + body, err := client.Download(context.Background(), server.URL+"/artifact") + + require.NoError(t, err) + t.Cleanup(func() { _ = body.Close() }) + + data, err := io.ReadAll(body) + require.NoError(t, err) + assert.Equal(t, "ok", string(data)) + assert.GreaterOrEqual(t, calls.Load(), int32(3), "expected at least three attempts") +} + +func TestClientDoesNotRetryClientErrors(t *testing.T) { + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + calls.Add(1) + http.Error(w, "nope", http.StatusNotFound) + })) + t.Cleanup(server.Close) + + client := httpupstream.New(server.Client()) + _, err := client.Download(context.Background(), server.URL+"/artifact") + + require.Error(t, err) + assert.Contains(t, err.Error(), "404") + assert.Equal(t, int32(1), calls.Load(), "4xx must not be retried") +} + +func TestClientStopsRetryingOnContextCancel(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "boom", http.StatusServiceUnavailable) + })) + t.Cleanup(server.Close) + + client := httpupstream.New(server.Client()) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) + t.Cleanup(cancel) + + _, err := client.Download(ctx, server.URL+"/artifact") + + require.Error(t, err) + assert.True( + t, + errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || isRetryableErr(err), + "expected ctx error or last retryable error, got %v", err, + ) +} + +func isRetryableErr(err error) bool { + return err != nil && (assertContains(err.Error(), "503") || assertContains(err.Error(), "retryable")) +} + +func assertContains(s, sub string) bool { + for i := range len(s) - len(sub) + 1 { + if s[i:i+len(sub)] == sub { + return true + } + } + + return false +} diff --git a/tools/labctl/internal/adapters/localfs/filesystem.go b/tools/labctl/internal/adapters/localfs/filesystem.go index 22fb5bd..354ad1b 100644 --- a/tools/labctl/internal/adapters/localfs/filesystem.go +++ b/tools/labctl/internal/adapters/localfs/filesystem.go @@ -6,6 +6,7 @@ import ( "os" "github.com/gilmanlab/platform/tools/labctl/internal/app/incusosimage" + "github.com/gilmanlab/platform/tools/labctl/internal/app/talosimage" ) // FileSystem reads and writes build artifacts on the local filesystem. @@ -49,3 +50,19 @@ func (FileSystem) Create(path string) (io.WriteCloser, error) { func (FileSystem) OpenReadWrite(path string) (incusosimage.WritableFile, error) { return os.OpenFile(path, os.O_RDWR, 0) } + +// CreateTemp creates a new uniquely-named temporary file in dir using the +// supplied name pattern. The returned TempFile must be closed by the caller. +func (FileSystem) CreateTemp(dir, pattern string) (talosimage.TempFile, error) { + return os.CreateTemp(dir, pattern) +} + +// Remove removes the named file. +func (FileSystem) Remove(path string) error { + return os.Remove(path) +} + +// Rename atomically replaces newPath with oldPath. +func (FileSystem) Rename(oldPath, newPath string) error { + return os.Rename(oldPath, newPath) +} diff --git a/tools/labctl/internal/app/talosimage/service.go b/tools/labctl/internal/app/talosimage/service.go index 4aeea61..28893c5 100644 --- a/tools/labctl/internal/app/talosimage/service.go +++ b/tools/labctl/internal/app/talosimage/service.go @@ -2,6 +2,8 @@ package talosimage import ( "context" + "crypto/sha256" + "encoding/hex" "errors" "fmt" "io" @@ -15,6 +17,12 @@ import ( const ( fileModeDir = 0o755 + + // MaxBootImageBytes bounds the decompressed boot image to defend against + // upstream artifacts whose decompressed size is unexpectedly large. + MaxBootImageBytes int64 = 4 * 1024 * 1024 * 1024 + + sha256Suffix = ".sha256" ) // Service builds Talos image artifacts. @@ -55,11 +63,18 @@ func (s Service) Build(ctx context.Context, request Request) (Result, error) { if err != nil { return Result{}, err } - err = s.downloadArchive(ctx, image.url, archivePath) + + expectedSHA256, err := s.upstream.FetchSHA256(ctx, image.url+sha256Suffix) + if err != nil { + return Result{}, fmt.Errorf("fetch Talos image checksum: %w", err) + } + + err = s.ensureCachedArchive(ctx, image.url, archivePath, expectedSHA256) if err != nil { return Result{}, err } - err = s.writeBootImage(archivePath, paths.bootArtifactPath) + + bootSHA256, err := s.writeBootImage(archivePath, paths.bootArtifactPath) if err != nil { return Result{}, err } @@ -68,20 +83,28 @@ func (s Service) Build(ctx context.Context, request Request) (Result, error) { if err != nil { return Result{}, err } - if err := s.configDisk.Build(paths.configArtifactPath, payload); err != nil { + err = s.configDisk.Build(paths.configArtifactPath, payload) + if err != nil { return Result{}, fmt.Errorf("build NoCloud cidata image %q: %w", paths.configArtifactPath, err) } + configSHA256, err := s.hashFile(paths.configArtifactPath) + if err != nil { + return Result{}, fmt.Errorf("hash NoCloud cidata image %q: %w", paths.configArtifactPath, err) + } + return Result{ - Name: string(request.Config.Name), - BootArtifactPath: paths.bootArtifactPath, - ConfigArtifactPath: paths.configArtifactPath, - SourceVersion: image.version, - SourceURL: image.url, - SourceSchematicID: image.schematicID, - Platform: image.platform, - Arch: image.arch, - Format: string(request.Config.Output.Format), + Name: string(request.Config.Name), + BootArtifactPath: paths.bootArtifactPath, + BootArtifactSHA256: bootSHA256, + ConfigArtifactPath: paths.configArtifactPath, + ConfigArtifactSHA256: configSHA256, + SourceVersion: image.version, + SourceURL: image.url, + SourceSchematicID: image.schematicID, + Platform: image.platform, + Arch: image.arch, + Format: string(request.Config.Output.Format), }, nil } @@ -119,14 +142,56 @@ func (s Service) prepareDownloadDirectory(archivePath string) error { return nil } -func (s Service) downloadArchive(ctx context.Context, url string, destination string) error { +// ensureCachedArchive guarantees that destination contains the archive whose +// SHA256 matches expected. A cached file with a matching digest is reused; a +// cached file with a divergent digest is removed and redownloaded; a missing +// destination triggers a fresh download. +func (s Service) ensureCachedArchive(ctx context.Context, url, destination, expected string) error { exists, err := s.files.IsFile(destination) if err != nil { return fmt.Errorf("check cached archive %q: %w", destination, err) } if exists { - return nil + actual, err := s.hashFile(destination) + if err != nil { + return fmt.Errorf("hash cached archive %q: %w", destination, err) + } + if actual == expected { + return nil + } + if err := s.files.Remove(destination); err != nil { + return fmt.Errorf("remove stale cached archive %q: %w", destination, err) + } + } + + return s.downloadArchive(ctx, url, destination, expected) +} + +// downloadArchive streams the archive into a sibling temp file, hashing as +// it writes, and atomically renames into place only when the streamed digest +// matches expected. A leftover temp file from a failed run is removed on +// best effort; a stranded `.tmp-*` is harmless because subsequent runs +// create new temp files with unique suffixes. +func (s Service) downloadArchive(ctx context.Context, url, destination, expected string) error { + parent := filepath.Dir(destination) + tempName := "." + filepath.Base(destination) + ".tmp-*" + + temp, err := s.files.CreateTemp(parent, tempName) + if err != nil { + return fmt.Errorf("create temporary archive in %q: %w", parent, err) } + tempPath := temp.Name() + + closed := false + committed := false + defer func() { + if !closed { + _ = temp.Close() + } + if !committed { + _ = s.files.Remove(tempPath) + } + }() source, err := s.upstream.Download(ctx, url) if err != nil { @@ -134,42 +199,107 @@ func (s Service) downloadArchive(ctx context.Context, url string, destination st } defer source.Close() - target, err := s.files.Create(destination) + hasher := sha256.New() + _, err = io.Copy(io.MultiWriter(temp, hasher), source) + if err != nil { + return fmt.Errorf("write temporary archive %q: %w", tempPath, err) + } + err = temp.Close() if err != nil { - return fmt.Errorf("create archive %q: %w", destination, err) + return fmt.Errorf("close temporary archive %q: %w", tempPath, err) + } + closed = true + + actual := hex.EncodeToString(hasher.Sum(nil)) + if actual != expected { + return fmt.Errorf("sha256 mismatch for %q: expected %s, got %s", url, expected, actual) } - defer target.Close() - if _, err := io.Copy(target, source); err != nil { - return fmt.Errorf("write archive %q: %w", destination, err) + err = s.files.Rename(tempPath, destination) + if err != nil { + return fmt.Errorf("install archive %q: %w", destination, err) } + committed = true return nil } -func (s Service) writeBootImage(archivePath string, bootArtifactPath string) error { +// writeBootImage decompresses the cached xz archive into a temp file beside +// bootArtifactPath, hashes the decompressed bytes, bounds the output to +// MaxBootImageBytes, and atomically renames into place. The returned digest +// is the lowercase hex SHA256 of the decompressed boot image. +func (s Service) writeBootImage(archivePath, bootArtifactPath string) (string, error) { archive, err := s.files.Open(archivePath) if err != nil { - return fmt.Errorf("open compressed archive %q: %w", archivePath, err) + return "", fmt.Errorf("open compressed archive %q: %w", archivePath, err) } defer archive.Close() raw, err := xz.NewReader(archive) if err != nil { - return fmt.Errorf("read xz archive %q: %w", archivePath, err) + return "", fmt.Errorf("read xz archive %q: %w", archivePath, err) } - target, err := s.files.Create(bootArtifactPath) + parent := filepath.Dir(bootArtifactPath) + tempName := "." + filepath.Base(bootArtifactPath) + ".tmp-*" + + temp, err := s.files.CreateTemp(parent, tempName) if err != nil { - return fmt.Errorf("create boot image %q: %w", bootArtifactPath, err) + return "", fmt.Errorf("create temporary boot image in %q: %w", parent, err) } - defer target.Close() + tempPath := temp.Name() - if _, err := io.Copy(target, raw); err != nil { - return fmt.Errorf("decompress boot image %q: %w", bootArtifactPath, err) + closed := false + committed := false + defer func() { + if !closed { + _ = temp.Close() + } + if !committed { + _ = s.files.Remove(tempPath) + } + }() + + hasher := sha256.New() + written, err := io.Copy(io.MultiWriter(temp, hasher), io.LimitReader(raw, MaxBootImageBytes+1)) + if err != nil { + return "", fmt.Errorf("decompress boot image %q: %w", bootArtifactPath, err) + } + if written > MaxBootImageBytes { + return "", fmt.Errorf( + "decompressed boot image %q exceeds maximum size %d bytes", + bootArtifactPath, + MaxBootImageBytes, + ) } + err = temp.Close() + if err != nil { + return "", fmt.Errorf("close temporary boot image %q: %w", tempPath, err) + } + closed = true - return nil + err = s.files.Rename(tempPath, bootArtifactPath) + if err != nil { + return "", fmt.Errorf("install boot image %q: %w", bootArtifactPath, err) + } + committed = true + + return hex.EncodeToString(hasher.Sum(nil)), nil +} + +func (s Service) hashFile(path string) (string, error) { + source, err := s.files.Open(path) + if err != nil { + return "", fmt.Errorf("open %q: %w", path, err) + } + defer source.Close() + + hasher := sha256.New() + if _, err := io.Copy(hasher, source); err != nil { + return "", fmt.Errorf("read %q: %w", path, err) + } + + return hex.EncodeToString(hasher.Sum(nil)), nil } func (s Service) buildConfigDiskPayload(baseDir string, config schematalos.MachineConfig) (ConfigDiskPayload, error) { @@ -273,7 +403,7 @@ func resolvePaths(baseDir string, output schematalos.ImageOutput) (buildPaths, e }, nil } -func resolvePath(baseDir string, path string) (string, error) { +func resolvePath(baseDir, path string) (string, error) { if filepath.IsAbs(path) { return filepath.Clean(path), nil } diff --git a/tools/labctl/internal/app/talosimage/service_test.go b/tools/labctl/internal/app/talosimage/service_test.go index 42ac28e..4e8ab5d 100644 --- a/tools/labctl/internal/app/talosimage/service_test.go +++ b/tools/labctl/internal/app/talosimage/service_test.go @@ -3,6 +3,9 @@ package talosimage_test import ( "bytes" "context" + "crypto/sha256" + "encoding/hex" + "errors" "io" "os" "path/filepath" @@ -48,11 +51,13 @@ func TestServiceBuildWritesBootImageAndNoCloudPayload(t *testing.T) { assert.Equal(t, "nocloud", result.Platform) assert.Equal(t, "amd64", result.Arch) assert.Equal(t, "img", result.Format) - assert.Equal(t, result.SourceURL, upstream.url) + assert.Equal(t, result.SourceURL, upstream.downloadURL) + assert.Equal(t, result.SourceURL+".sha256", upstream.checksumURL) boot, err := os.ReadFile(filepath.Clean(result.BootArtifactPath)) require.NoError(t, err) assert.Equal(t, raw, boot) + assert.Equal(t, hashHex(raw), result.BootArtifactSHA256) require.Equal(t, result.ConfigArtifactPath, configDisk.path) assert.Equal(t, []byte("machine:\n type: controlplane\n"), configDisk.payload.UserData) @@ -60,6 +65,10 @@ func TestServiceBuildWritesBootImageAndNoCloudPayload(t *testing.T) { assert.Contains(t, string(configDisk.payload.MetaData), "local-hostname: bootstrap-controlplane-1") assert.Equal(t, []byte("version: 1\n"), configDisk.payload.NetworkConfig) + cidata, err := os.ReadFile(filepath.Clean(result.ConfigArtifactPath)) + require.NoError(t, err) + assert.Equal(t, hashHex(cidata), result.ConfigArtifactSHA256) + cachePath := filepath.Join( baseDir, ".state", @@ -80,6 +89,116 @@ func TestServiceBuildWritesBootImageAndNoCloudPayload(t *testing.T) { assert.Equal(t, 1, upstream.downloads, "expected second build to reuse the cached archive") } +func TestServiceBuildRedownloadsCorruptCache(t *testing.T) { + baseDir := t.TempDir() + writeFixture(t, baseDir, "controlplane.yaml", "machine:\n type: controlplane\n") + + raw := []byte("talos-raw-image") + upstream := &fakeUpstream{artifact: xzBytes(t, raw)} + configDisk := &fakeConfigDiskBuilder{} + service := talosimage.NewService(talosimage.Dependencies{ + Upstream: upstream, + Files: localfs.New(), + ConfigDisk: configDisk, + }) + + cachePath := filepath.Join( + baseDir, + ".state", + "downloads", + "talos", + "376567988ad370138ad8b2698212367b8edcb69b5fd68c80be1f2ec7d603b4ba", + "v1.13.0", + "nocloud-amd64.raw.xz", + ) + require.NoError(t, os.MkdirAll(filepath.Dir(cachePath), 0o755)) + require.NoError(t, os.WriteFile(cachePath, []byte("corrupt-cache-content"), 0o600)) + + cfg := testConfig() + cfg.Config.NetworkConfig = schematalos.FileInput{} + _, err := service.Build(context.Background(), talosimage.Request{ + Config: cfg, + BaseDir: baseDir, + }) + + require.NoError(t, err) + assert.Equal(t, 1, upstream.downloads, "expected corrupt cache to trigger a redownload") + + cached, err := os.ReadFile(cachePath) + require.NoError(t, err) + assert.Equal(t, upstream.artifact, cached) +} + +func TestServiceBuildRejectsSidecarMismatch(t *testing.T) { + baseDir := t.TempDir() + writeFixture(t, baseDir, "controlplane.yaml", "machine:\n type: controlplane\n") + + raw := []byte("talos-raw-image") + upstream := &fakeUpstream{ + artifact: xzBytes(t, raw), + // Force the sidecar response to a digest that does not match what + // the service will compute on the streamed body. + sha256Override: "0000000000000000000000000000000000000000000000000000000000000000", + } + configDisk := &fakeConfigDiskBuilder{} + service := talosimage.NewService(talosimage.Dependencies{ + Upstream: upstream, + Files: localfs.New(), + ConfigDisk: configDisk, + }) + + cfg := testConfig() + cfg.Config.NetworkConfig = schematalos.FileInput{} + _, err := service.Build(context.Background(), talosimage.Request{ + Config: cfg, + BaseDir: baseDir, + }) + + require.Error(t, err) + assert.Contains(t, err.Error(), "sha256 mismatch") + + downloadsDir := filepath.Join( + baseDir, + ".state", + "downloads", + "talos", + "376567988ad370138ad8b2698212367b8edcb69b5fd68c80be1f2ec7d603b4ba", + "v1.13.0", + ) + entries, err := os.ReadDir(downloadsDir) + require.NoError(t, err) + for _, entry := range entries { + assert.NotContains(t, entry.Name(), ".tmp-", "temporary archive must not be left behind") + } +} + +func TestServiceBuildPropagatesSidecarError(t *testing.T) { + baseDir := t.TempDir() + writeFixture(t, baseDir, "controlplane.yaml", "machine:\n type: controlplane\n") + + upstream := &fakeUpstream{ + artifact: xzBytes(t, []byte("talos-raw-image")), + sha256Error: errors.New("sidecar unreachable"), + } + configDisk := &fakeConfigDiskBuilder{} + service := talosimage.NewService(talosimage.Dependencies{ + Upstream: upstream, + Files: localfs.New(), + ConfigDisk: configDisk, + }) + + cfg := testConfig() + cfg.Config.NetworkConfig = schematalos.FileInput{} + _, err := service.Build(context.Background(), talosimage.Request{ + Config: cfg, + BaseDir: baseDir, + }) + + require.Error(t, err) + assert.Contains(t, err.Error(), "fetch Talos image checksum") + assert.Equal(t, 0, upstream.downloads, "no download must happen when the sidecar fetch fails") +} + func testConfig() schematalos.ImageBuild { return schematalos.ImageBuild{ Name: "talos-test", @@ -114,18 +233,33 @@ func testConfig() schematalos.ImageBuild { } type fakeUpstream struct { - artifact []byte - downloads int - url string + artifact []byte + sha256Override string + sha256Error error + downloads int + downloadURL string + checksumURL string } func (f *fakeUpstream) Download(_ context.Context, url string) (io.ReadCloser, error) { f.downloads++ - f.url = url + f.downloadURL = url return io.NopCloser(bytes.NewReader(f.artifact)), nil } +func (f *fakeUpstream) FetchSHA256(_ context.Context, url string) (string, error) { + f.checksumURL = url + if f.sha256Error != nil { + return "", f.sha256Error + } + if f.sha256Override != "" { + return f.sha256Override, nil + } + + return hashHex(f.artifact), nil +} + type fakeConfigDiskBuilder struct { path string payload talosimage.ConfigDiskPayload @@ -135,10 +269,11 @@ func (f *fakeConfigDiskBuilder) Build(path string, payload talosimage.ConfigDisk f.path = path f.payload = payload - return nil + // Write a placeholder file so the service can hash the artifact. + return os.WriteFile(path, payload.UserData, 0o600) } -func writeFixture(t *testing.T, dir string, name string, data string) { +func writeFixture(t *testing.T, dir, name, data string) { t.Helper() require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(data), 0o600)) @@ -156,3 +291,9 @@ func xzBytes(t *testing.T, data []byte) []byte { return buffer.Bytes() } + +func hashHex(data []byte) string { + sum := sha256.Sum256(data) + + return hex.EncodeToString(sum[:]) +} diff --git a/tools/labctl/internal/app/talosimage/types.go b/tools/labctl/internal/app/talosimage/types.go index 1f2c29a..162e08a 100644 --- a/tools/labctl/internal/app/talosimage/types.go +++ b/tools/labctl/internal/app/talosimage/types.go @@ -22,8 +22,12 @@ type Result struct { Name string `json:"name"` // BootArtifactPath is the local path to the Talos boot disk image. BootArtifactPath string `json:"bootArtifactPath"` + // BootArtifactSHA256 is the lowercase hex SHA256 digest of the boot disk image. + BootArtifactSHA256 string `json:"bootArtifactSHA256"` // ConfigArtifactPath is the local path to the NoCloud cidata image. ConfigArtifactPath string `json:"configArtifactPath"` + // ConfigArtifactSHA256 is the lowercase hex SHA256 digest of the NoCloud cidata image. + ConfigArtifactSHA256 string `json:"configArtifactSHA256"` // SourceVersion is the selected upstream Talos Linux version. SourceVersion string `json:"sourceVersion"` // SourceURL is the selected upstream Talos Image Factory artifact URL. @@ -40,7 +44,7 @@ type Result struct { // Dependencies groups external ports needed to build Talos image artifacts. type Dependencies struct { - // Upstream downloads Talos Image Factory artifacts. + // Upstream downloads Talos Image Factory artifacts and their digests. Upstream Upstream // Files reads and writes local build artifacts. Files FileSystem @@ -48,17 +52,38 @@ type Dependencies struct { ConfigDisk ConfigDiskBuilder } -// Upstream downloads Talos Image Factory artifact bytes. +// Upstream downloads Talos Image Factory artifact bytes and digests. type Upstream interface { + // Download opens a streaming response body for the artifact at url. Download(ctx context.Context, url string) (io.ReadCloser, error) + // FetchSHA256 returns the lowercase hex SHA256 digest published at url. + FetchSHA256(ctx context.Context, url string) (string, error) } // FileSystem describes the local filesystem behavior used by the builder. type FileSystem interface { + // MkdirAll creates a directory and any missing parents. MkdirAll(path string, perm fs.FileMode) error + // IsFile reports whether path exists and is a regular file. IsFile(path string) (bool, error) + // Open opens path for reading. Open(path string) (io.ReadCloser, error) + // Create creates or truncates path for writing. Create(path string) (io.WriteCloser, error) + // CreateTemp creates a new uniquely-named temporary file in dir matching + // the supplied name pattern. Callers must close the returned TempFile. + CreateTemp(dir, pattern string) (TempFile, error) + // Remove removes the named file. + Remove(path string) error + // Rename atomically replaces newPath with oldPath. + Rename(oldPath, newPath string) error +} + +// TempFile is a filesystem-backed temporary file with a known on-disk path. +type TempFile interface { + io.WriteCloser + // Name returns the on-disk path of the temporary file. + Name() string } // ConfigDiskBuilder writes NoCloud cidata images. diff --git a/tools/labctl/internal/cli/bootstrap.go b/tools/labctl/internal/cli/bootstrap.go index d7bdb34..16de1d5 100644 --- a/tools/labctl/internal/cli/bootstrap.go +++ b/tools/labctl/internal/cli/bootstrap.go @@ -27,15 +27,17 @@ type imageBuildOutput struct { } type talosImageBuildOutput struct { - Name string `json:"name"` - BootArtifactPath string `json:"bootArtifactPath"` - ConfigArtifactPath string `json:"configArtifactPath"` - SourceVersion string `json:"sourceVersion"` - SourceURL string `json:"sourceURL"` - SourceSchematicID string `json:"sourceSchematicID"` - Platform string `json:"platform"` - Arch string `json:"arch"` - Format string `json:"format"` + Name string `json:"name"` + BootArtifactPath string `json:"bootArtifactPath"` + BootArtifactSHA256 string `json:"bootArtifactSHA256"` + ConfigArtifactPath string `json:"configArtifactPath"` + ConfigArtifactSHA256 string `json:"configArtifactSHA256"` + SourceVersion string `json:"sourceVersion"` + SourceURL string `json:"sourceURL"` + SourceSchematicID string `json:"sourceSchematicID"` + Platform string `json:"platform"` + Arch string `json:"arch"` + Format string `json:"format"` } type imageBuildSecretsFlags struct { @@ -276,15 +278,17 @@ func renderImageBuildResult(result incusosimage.Result, opts Options, flags *roo func renderTalosImageBuildResult(result talosimage.Result, opts Options, flags *rootFlags, jsonOutput bool) error { output := talosImageBuildOutput{ - Name: result.Name, - BootArtifactPath: result.BootArtifactPath, - ConfigArtifactPath: result.ConfigArtifactPath, - SourceVersion: result.SourceVersion, - SourceURL: result.SourceURL, - SourceSchematicID: result.SourceSchematicID, - Platform: result.Platform, - Arch: result.Arch, - Format: result.Format, + Name: result.Name, + BootArtifactPath: result.BootArtifactPath, + BootArtifactSHA256: result.BootArtifactSHA256, + ConfigArtifactPath: result.ConfigArtifactPath, + ConfigArtifactSHA256: result.ConfigArtifactSHA256, + SourceVersion: result.SourceVersion, + SourceURL: result.SourceURL, + SourceSchematicID: result.SourceSchematicID, + Platform: result.Platform, + Arch: result.Arch, + Format: result.Format, } if jsonOutput { diff --git a/tools/labctl/internal/composition/composition.go b/tools/labctl/internal/composition/composition.go index 87b6717..ecbd8dc 100644 --- a/tools/labctl/internal/composition/composition.go +++ b/tools/labctl/internal/composition/composition.go @@ -57,7 +57,7 @@ func New(input Input) Dependencies { httpClient := input.HTTPClient if httpClient == nil { - httpClient = http.DefaultClient + httpClient = httpupstream.NewHTTPClient() } files := localfs.New() broker := githubbroker.NewProvider(nil)