diff --git a/cmd/buildkitd/main.go b/cmd/buildkitd/main.go index 7de17c050559..9827f07b1805 100644 --- a/cmd/buildkitd/main.go +++ b/cmd/buildkitd/main.go @@ -68,6 +68,7 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" + "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/prometheus" "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" @@ -297,6 +298,13 @@ func main() { } closers = append(closers, mp.Shutdown) + // Register the meter provider globally so that callers using + // otel.Meter(...) (e.g. frontend/gateway preface metrics) see + // the configured Prometheus + OTLP readers instead of the + // no-op default. Without this, otel.Meter(...) returns a + // no-op meter and instrument recordings are silently dropped. + otel.SetMeterProvider(mp) + statsHandler := tracing.ServerStatsHandler( otelgrpc.WithTracerProvider(tp), otelgrpc.WithMeterProvider(mp), diff --git a/executor/runcexecutor/executor.go b/executor/runcexecutor/executor.go index c4e5f8fd0674..8d1dd334084a 100644 --- a/executor/runcexecutor/executor.go +++ b/executor/runcexecutor/executor.go @@ -328,6 +328,12 @@ func (w *runcExecutor) Run(ctx context.Context, id string, root executor.Mount, trace.SpanFromContext(ctx).AddEvent("Container created") err = w.run(ctx, id, bundle, process, func() { startedOnce.Do(func() { + // Mirrors the existing "> creating" log line above. The + // delta between "creating" and "started" is the runc + // bundle setup + exec latency, which is the dominant + // budget consumer for the gateway HTTP/2 preface deadline + // when the storage backend is contended. + bklog.G(ctx).Debugf("> started %s %v", id, meta.Args) trace.SpanFromContext(ctx).AddEvent("Container started") if started != nil { close(started) diff --git a/frontend/gateway/dup_unix_test.go b/frontend/gateway/dup_unix_test.go new file mode 100644 index 000000000000..845c477cc2f5 --- /dev/null +++ b/frontend/gateway/dup_unix_test.go @@ -0,0 +1,13 @@ +//go:build !windows + +package gateway + +import "syscall" + +// dupFD returns a duplicate of the given file descriptor. Used only by +// tests in this package to simulate the runc-spawned child process +// holding an inherited fd of one end of an os.Pipe pair, so that +// closing the parent's end does not cause the other end to see EOF. +func dupFD(fd int) (int, error) { + return syscall.Dup(fd) +} diff --git a/frontend/gateway/dup_windows_test.go b/frontend/gateway/dup_windows_test.go new file mode 100644 index 000000000000..80739e25f3ff --- /dev/null +++ b/frontend/gateway/dup_windows_test.go @@ -0,0 +1,11 @@ +//go:build windows + +package gateway + +import "errors" + +// dupFD is a stub on windows; tests that call it gate themselves on +// runtime.GOOS != "windows" and do not exercise this path. +func dupFD(_ int) (int, error) { + return 0, errors.New("dupFD not supported on windows") +} diff --git a/frontend/gateway/gateway.go b/frontend/gateway/gateway.go index 983942b1cd47..39960eaed49c 100644 --- a/frontend/gateway/gateway.go +++ b/frontend/gateway/gateway.go @@ -52,6 +52,9 @@ import ( digest "github.com/opencontainers/go-digest" ocispecs "github.com/opencontainers/image-spec/specs-go/v1" "github.com/pkg/errors" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "golang.org/x/net/http2" "golang.org/x/sync/errgroup" spb "google.golang.org/genproto/googleapis/rpc/status" @@ -502,6 +505,42 @@ type conn struct { io.Closer } +// Close closes all three underlying file descriptors, not just the +// io.Closer field. This matters because newPipe builds the conn from +// two os.Pipe() pairs: the Reader and Closer are different ends of +// different pipes (pr2 and pw2 respectively), and the other ends +// (pw2 inherited as Stdout, pr1 inherited as Stdin) are passed to the +// spawned frontend container's runc process. Closing only the Closer +// (the previous, promoted-method behavior) does not cause the +// in-process Reader to see EOF, because the frontend process still +// holds a duplicate fd of that pipe end via its inherited Stdout. +// readPrefaceWithTimeout's timeout path relies on Close to unblock +// its io.ReadFull goroutine reading from Reader; closing the Reader +// fd directly causes any in-flight or subsequent Read to return +// immediately with a closed-pipe error, which is the documented +// behavior of os.File.Close while another goroutine is in Read. +// +// Concurrent and repeated calls are safe: os.File handles internal +// synchronisation, and existing call sites (defer lbf.conn.Close, +// the ctx.Done goroutine in serve, and read-error paths) ignore the +// returned error. +func (s *conn) Close() error { + var firstErr error + closeIfCloser := func(v any) { + if c, ok := v.(io.Closer); ok { + if err := c.Close(); err != nil && firstErr == nil { + firstErr = err + } + } + } + closeIfCloser(s.Reader) + closeIfCloser(s.Writer) + if err := s.Closer.Close(); err != nil && firstErr == nil { + firstErr = err + } + return firstErr +} + func (s *conn) LocalAddr() net.Addr { return dummyAddr{} } @@ -1720,13 +1759,178 @@ func (lbf *llbBridgeForwarder) cloneRef(id string) (solver.ResultProxy, error) { return s2, nil } +// HTTP/2 client preface handling for gateway connections. +// +// The vendored golang.org/x/net/http2 package enforces a hard-coded 10s +// preface timeout in http2.Server.ServeConn (see +// vendor/golang.org/x/net/http2/server.go: const prefaceTimeout). When +// buildkitd spawns a frontend container (e.g. dockerfile-frontend) and +// slow runc bundle setup, overlay-mount, or cold-cache rootfs reads +// delay the frontend's first preface bytes past 10s, ServeConn aborts +// with errPrefaceTimeout, the gateway tears down the gRPC connection, +// and the build fails with `frontend grpc server closed unexpectedly`. +// +// The gateway now reads the 24-byte client preface itself with a +// configurable, longer-by-default deadline, validates it, and then +// hands the already-drained net.Conn straight to http2.Server.ServeConn +// with ServeConnOpts.SawClientPreface=true. With that flag set, the +// vendored readPreface() returns nil immediately and skips the +// hard-coded 10s prefaceTimeout entirely. +const ( + // httpClientPreface is the 24-byte HTTP/2 client connection preface, + // as defined in RFC 7540 ยง3.5 and exported in golang.org/x/net/http2 + // as ClientPreface. The format is fixed by spec. + httpClientPreface = "PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n" + + // defaultPrefaceTimeout is the default ceiling on how long the + // gateway waits for the frontend container to send its HTTP/2 client + // preface. Significantly longer than the vendored 10s upstream + // prefaceTimeout to absorb slow runc bundle setup and cold-cache + // rootfs reads on contended storage backends. + defaultPrefaceTimeout = 60 * time.Second + + // prefaceTimeoutEnvVar lets operators tune the preface wait without a + // binary rebuild. Value is parsed as a Go time.Duration string, + // e.g. "30s", "2m". Invalid or non-positive values fall back to + // defaultPrefaceTimeout. + prefaceTimeoutEnvVar = "BUILDKIT_GATEWAY_PREFACE_TIMEOUT" +) + +var ( + gatewayMeter = otel.Meter("github.com/moby/buildkit/frontend/gateway") + gatewayMetricsOnce sync.Once + prefaceWaitHistogram metric.Float64Histogram + prefaceErrorCounter metric.Int64Counter +) + +func initGatewayMetrics() { + gatewayMetricsOnce.Do(func() { + if h, err := gatewayMeter.Float64Histogram( + "buildkit_frontend_preface_wait_seconds", + metric.WithDescription("Time the gateway waited for the HTTP/2 client preface from the frontend container after spawning it."), + metric.WithUnit("s"), + ); err == nil { + prefaceWaitHistogram = h + } + if c, err := gatewayMeter.Int64Counter( + "buildkit_frontend_preface_error_total", + metric.WithDescription("Number of times the gateway failed to receive the HTTP/2 client preface from the frontend container, partitioned by reason."), + ); err == nil { + prefaceErrorCounter = c + } + }) +} + +// prefaceReadTimeout returns the configured timeout for the gateway's +// HTTP/2 client preface pre-read. Defaults to defaultPrefaceTimeout, +// overridable via the BUILDKIT_GATEWAY_PREFACE_TIMEOUT environment +// variable. +func prefaceReadTimeout() time.Duration { + if v := os.Getenv(prefaceTimeoutEnvVar); v != "" { + if d, err := time.ParseDuration(v); err == nil && d > 0 { + return d + } + bklog.L.Warnf("frontend gateway: invalid %s=%q, falling back to %s", prefaceTimeoutEnvVar, v, defaultPrefaceTimeout) + } + return defaultPrefaceTimeout +} + +var ( + errPrefaceReadTimeout = errors.New("timed out reading HTTP/2 client preface from frontend container") + errPrefaceMismatch = errors.New("frontend container did not send a valid HTTP/2 client preface") +) + +// readPrefaceWithTimeout reads exactly len(httpClientPreface) bytes from +// conn or returns an error after timeout. Mirrors the timer-race +// approach in vendored x/net/http2/server.go:readPreface, which is +// required because the gateway's pipe-based net.Conn (see *conn above) +// overrides the deadline setters to no-op so SetReadDeadline cannot be +// used to bound the read. The returned bytes are validated against +// httpClientPreface; a mismatch returns errPrefaceMismatch. +func readPrefaceWithTimeout(conn net.Conn, timeout time.Duration) error { + buf := make([]byte, len(httpClientPreface)) + errc := make(chan error, 1) + go func() { + _, err := io.ReadFull(conn, buf) + errc <- err + }() + timer := time.NewTimer(timeout) + defer timer.Stop() + select { + case <-timer.C: + // Close the underlying connection to unblock the io.ReadFull + // goroutine reading from conn. *conn.Close (defined above) + // closes the Reader fd directly, which causes the in-flight + // Read to return immediately with a closed-pipe error so the + // goroutine returns and writes to the buffered errc channel. + // errc is sized 1 so even if conn.Close were a no-op (e.g. a + // hypothetical net.Conn implementation that doesn't unblock + // Read on Close), the goroutine would still not leak: it would + // remain parked on the Read until the underlying transport + // closes, then write to the already-buffered channel and exit. + _ = conn.Close() + return errPrefaceReadTimeout + case err := <-errc: + if err != nil { + return err + } + if string(buf) != httpClientPreface { + return errPrefaceMismatch + } + return nil + } +} + func serve(ctx context.Context, grpcServer *grpc.Server, conn net.Conn) { + initGatewayMetrics() go func() { <-ctx.Done() conn.Close() }() - bklog.G(ctx).Debugf("serving grpc connection") - (&http2.Server{}).ServeConn(conn, &http2.ServeConnOpts{Handler: grpcServer}) + + timeout := prefaceReadTimeout() + start := time.Now() + err := readPrefaceWithTimeout(conn, timeout) + elapsed := time.Since(start) + + logger := bklog.G(ctx). + WithField("preface_wait_ms", elapsed.Milliseconds()). + WithField("preface_timeout_ms", timeout.Milliseconds()) + + if err != nil { + reason := "read_error" + switch { + case errors.Is(err, errPrefaceReadTimeout): + reason = "timeout" + case errors.Is(err, errPrefaceMismatch): + reason = "mismatch" + case errors.Is(err, io.EOF), errors.Is(err, io.ErrUnexpectedEOF): + reason = "eof" + } + if prefaceErrorCounter != nil { + prefaceErrorCounter.Add(ctx, 1, metric.WithAttributes(attribute.String("reason", reason))) + } + logger.WithField("preface_error_reason", reason). + Warnf("frontend serve: failed to read HTTP/2 client preface from frontend container: %v", err) + // ensure conn is closed so the upstream goroutine observes the + // failure as server-closed rather than blocking on Run() + _ = conn.Close() + return + } + + if prefaceWaitHistogram != nil { + prefaceWaitHistogram.Record(ctx, elapsed.Seconds()) + } + logger.Debugf("frontend serve: received HTTP/2 client preface") + + // SawClientPreface tells http2.Server.ServeConn that the 24-byte + // HTTP/2 client preface has already been consumed from conn, so its + // internal readPreface() returns nil immediately and skips the + // hard-coded 10s prefaceTimeout. + (&http2.Server{}).ServeConn(conn, &http2.ServeConnOpts{ + Handler: grpcServer, + SawClientPreface: true, + }) } type markTypeFrontend struct{} diff --git a/frontend/gateway/gateway_test.go b/frontend/gateway/gateway_test.go index 79b88d7c3301..be788abd053d 100644 --- a/frontend/gateway/gateway_test.go +++ b/frontend/gateway/gateway_test.go @@ -1,7 +1,13 @@ package gateway import ( + "errors" + "io" + "net" + "os" + "runtime" "testing" + "time" "github.com/stretchr/testify/require" ) @@ -43,3 +49,167 @@ func TestCheckSourceIsAllowed(t *testing.T) { err = gw.checkSourceIsAllowed("docker.io/library/alpine") require.NoError(t, err) } + +func TestReadPrefaceWithTimeoutSuccess(t *testing.T) { + c1, c2 := net.Pipe() + defer c1.Close() + defer c2.Close() + + preface := []byte("PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n") + + go func() { + _, _ = c2.Write(preface) + }() + + require.NoError(t, readPrefaceWithTimeout(c1, 5*time.Second)) +} + +func TestReadPrefaceWithTimeoutFiresOnSlowSender(t *testing.T) { + c1, c2 := net.Pipe() + defer c2.Close() + + // Sender writes nothing; the timer race must fire and close c1. + start := time.Now() + err := readPrefaceWithTimeout(c1, 50*time.Millisecond) + elapsed := time.Since(start) + + require.ErrorIs(t, err, errPrefaceReadTimeout) + require.GreaterOrEqual(t, elapsed, 40*time.Millisecond) + require.Less(t, elapsed, 2*time.Second) + + // c1 should be closed by readPrefaceWithTimeout on the timeout path + // to unblock the inner io.ReadFull goroutine. + _, werr := c1.Write([]byte("x")) + require.Error(t, werr) +} + +func TestReadPrefaceWithTimeoutMismatch(t *testing.T) { + c1, c2 := net.Pipe() + defer c1.Close() + defer c2.Close() + + // Sender writes 24 bytes that do NOT match the HTTP/2 client preface. + bogus := []byte("NOT-A-VALID-PREFACE-XXXX") + require.Equal(t, len("PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n"), len(bogus)) + go func() { + _, _ = c2.Write(bogus) + }() + + require.ErrorIs(t, readPrefaceWithTimeout(c1, 5*time.Second), errPrefaceMismatch) +} + +// TestConnCloseUnblocksReaderInPipeMode reproduces the production conn +// shape (Reader and Closer are different ends of different os.Pipe +// pairs, with the "other ends" held by a separate process via +// inherited fds) and asserts that *conn.Close unblocks an in-flight +// Read on the Reader side. Without an explicit Close on the Reader fd, +// closing only the Closer leaves the read parked because the +// duplicate fd (modeled here by a kept-alive pw2Dup) prevents the +// kernel from sending EOF on the read side. This is the goroutine-leak +// failure mode in readPrefaceWithTimeout's timeout path. +func TestConnCloseUnblocksReaderInPipeMode(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("os.Pipe semantics around dup'd fds differ on windows; production runc-spawn path is linux-only") + } + + pr1, pw1, err := os.Pipe() + require.NoError(t, err) + pr2, pw2, err := os.Pipe() + require.NoError(t, err) + + // Simulate the runc-spawned frontend's inherited Stdout: a duplicate + // fd of pw2 held by the "child". In production this is the runc + // process's Stdout; here we hold a separate *os.File pointing at the + // same pipe write end via dup(). While this fd is open, closing pw2 + // alone in the parent does not cause pr2 to see EOF. + pw2Fd, err := dupFD(int(pw2.Fd())) + require.NoError(t, err) + pw2Dup := os.NewFile(uintptr(pw2Fd), "pw2-dup") + defer pw2Dup.Close() + + c := &conn{ + Reader: pr2, + Writer: pw1, + Closer: pw2, + } + + // Sanity: keep pr1 alive so the Writer side has a peer; otherwise + // the parent's first write would EPIPE in unrelated test paths. + _ = pr1 + + // Start a Read that will block until either data arrives on pr2 or + // pr2 is closed. In production this is the io.ReadFull goroutine + // inside readPrefaceWithTimeout. + readDone := make(chan error, 1) + go func() { + buf := make([]byte, 24) + _, err := io.ReadFull(c, buf) + readDone <- err + }() + + // Give the goroutine a moment to actually park on Read. + time.Sleep(50 * time.Millisecond) + + // Close the conn. The fix asserts that this closes the Reader fd + // (pr2) and unblocks io.ReadFull, even though the simulated child + // process (pw2Dup) still holds a writer-side fd open. + require.NoError(t, c.Close()) + + select { + case err := <-readDone: + // io.ReadFull returns either io.ErrUnexpectedEOF or a closed-pipe + // error depending on which side wins the close race; both are + // acceptable for the leak-fix assertion (the goroutine exited). + require.Error(t, err) + require.True(t, + errors.Is(err, io.ErrUnexpectedEOF) || + errors.Is(err, os.ErrClosed) || + errors.Is(err, io.EOF), + "expected close-pipe-style error, got %v", err) + case <-time.After(2 * time.Second): + t.Fatal("io.ReadFull goroutine still blocked 2s after conn.Close; goroutine-leak fix regressed") + } +} + +// TestConnCloseIsIdempotent asserts that calling Close more than once +// (which happens in production: defer lbf.conn.Close, the ctx.Done +// goroutine in serve, and read-error paths can all fire) does not +// panic and is safe. +func TestConnCloseIsIdempotent(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("os.Pipe semantics differ on windows") + } + + pr1, pw1, err := os.Pipe() + require.NoError(t, err) + pr2, pw2, err := os.Pipe() + require.NoError(t, err) + defer pr1.Close() + + c := &conn{ + Reader: pr2, + Writer: pw1, + Closer: pw2, + } + + require.NoError(t, c.Close()) + // Second close should not panic. os.File.Close on an already-closed + // file returns an error that callers (defer/ignore) can drop. + _ = c.Close() +} + +func TestPrefaceReadTimeoutEnvOverride(t *testing.T) { + t.Setenv(prefaceTimeoutEnvVar, "") + require.Equal(t, defaultPrefaceTimeout, prefaceReadTimeout()) + + t.Setenv(prefaceTimeoutEnvVar, "2m") + require.Equal(t, 2*time.Minute, prefaceReadTimeout()) + + // invalid value falls back to default + t.Setenv(prefaceTimeoutEnvVar, "not-a-duration") + require.Equal(t, defaultPrefaceTimeout, prefaceReadTimeout()) + + // non-positive falls back to default + t.Setenv(prefaceTimeoutEnvVar, "0s") + require.Equal(t, defaultPrefaceTimeout, prefaceReadTimeout()) +} diff --git a/go.mod b/go.mod index 2523057f9ce3..bd4c0dcca51e 100644 --- a/go.mod +++ b/go.mod @@ -97,6 +97,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.40.0 go.opentelemetry.io/otel/exporters/prometheus v0.60.0 + go.opentelemetry.io/otel/metric v1.40.0 go.opentelemetry.io/otel/sdk v1.40.0 go.opentelemetry.io/otel/sdk/metric v1.40.0 go.opentelemetry.io/otel/trace v1.40.0 @@ -223,7 +224,6 @@ require ( github.com/vishvananda/netns v0.0.5 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/otel/metric v1.40.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/term v0.41.0 // indirect