From 73f369d8ed9abeb4fa028718130ffc72d64756a5 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Wed, 20 May 2026 01:54:45 +0530 Subject: [PATCH 1/7] feat: add prometheus metrics for workload manager and router observability Signed-off-by: Abhinav Singh --- go.mod | 3 +- .../base/templates/agentcube-router.yaml | 4 + .../base/templates/workloadmanager.yaml | 4 + pkg/router/handlers.go | 21 ++++- pkg/router/metrics.go | 61 +++++++++++++ pkg/router/metrics_test.go | 63 +++++++++++++ pkg/router/server.go | 4 + pkg/router/session_manager.go | 2 + pkg/workloadmanager/garbage_collection.go | 21 +++++ pkg/workloadmanager/handlers.go | 60 +++++++++---- pkg/workloadmanager/handlers_test.go | 4 +- pkg/workloadmanager/metrics.go | 88 +++++++++++++++++++ pkg/workloadmanager/metrics_test.go | 73 +++++++++++++++ pkg/workloadmanager/server.go | 2 + 14 files changed, 385 insertions(+), 25 deletions(-) create mode 100644 pkg/router/metrics.go create mode 100644 pkg/router/metrics_test.go create mode 100644 pkg/workloadmanager/metrics.go create mode 100644 pkg/workloadmanager/metrics_test.go diff --git a/go.mod b/go.mod index 3cc399a1..ca9a1820 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/gin-gonic/gin v1.10.0 github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 + github.com/prometheus/client_golang v1.23.2 github.com/redis/go-redis/v9 v9.17.1 github.com/stretchr/testify v1.11.1 github.com/valkey-io/valkey-go v1.0.69 @@ -67,6 +68,7 @@ require ( github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/cpuid/v2 v2.2.7 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect @@ -74,7 +76,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.1 // indirect github.com/prometheus/procfs v0.17.0 // indirect diff --git a/manifests/charts/base/templates/agentcube-router.yaml b/manifests/charts/base/templates/agentcube-router.yaml index 9e4410e6..abd12193 100644 --- a/manifests/charts/base/templates/agentcube-router.yaml +++ b/manifests/charts/base/templates/agentcube-router.yaml @@ -101,6 +101,10 @@ spec: apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: {{ .Values.router.service.port | quote }} + prometheus.io/scrape: "true" name: agentcube-router namespace: {{ .Release.Namespace }} labels: diff --git a/manifests/charts/base/templates/workloadmanager.yaml b/manifests/charts/base/templates/workloadmanager.yaml index 5a02bc98..c4d547d1 100644 --- a/manifests/charts/base/templates/workloadmanager.yaml +++ b/manifests/charts/base/templates/workloadmanager.yaml @@ -97,6 +97,10 @@ spec: apiVersion: v1 kind: Service metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: {{ .Values.workloadmanager.service.port | quote }} + prometheus.io/scrape: "true" name: workloadmanager namespace: {{ .Release.Namespace }} labels: diff --git a/pkg/router/handlers.go b/pkg/router/handlers.go index adbd0fdd..212b5d27 100644 --- a/pkg/router/handlers.go +++ b/pkg/router/handlers.go @@ -56,6 +56,11 @@ func (s *Server) handleHealthReady(c *gin.Context) { // handleInvoke is a private helper function that handles invocation requests for both agents and code interpreters func (s *Server) handleInvoke(c *gin.Context, namespace, name, path, kind string) { + start := time.Now() + defer func() { + routerRequestDuration.WithLabelValues(kind).Observe(time.Since(start).Seconds()) + }() + klog.V(4).Infof("%s invoke request: namespace=%s, name=%s, path=%s", kind, namespace, name, path) // Extract session ID from header @@ -201,14 +206,24 @@ func (s *Server) waitForUpstreamReachable(ctx context.Context, targetURL *url.UR func upstreamUnavailableResponse(err error) (int, gin.H) { errText := strings.ToLower(err.Error()) + var category string + var code int + var resp gin.H + switch { case connectionRefusedRetryable(err): - return http.StatusBadGateway, gin.H{"error": "sandbox unreachable"} + category = "connection_refused" + code, resp = http.StatusBadGateway, gin.H{"error": "sandbox unreachable"} case strings.Contains(errText, "deadline exceeded") || strings.Contains(errText, "timeout"): - return http.StatusGatewayTimeout, gin.H{"error": "sandbox timeout"} + category = "timeout" + code, resp = http.StatusGatewayTimeout, gin.H{"error": "sandbox timeout"} default: - return http.StatusServiceUnavailable, gin.H{"error": "sandbox unreachable"} + category = "other" + code, resp = http.StatusServiceUnavailable, gin.H{"error": "sandbox unreachable"} } + + routerProxyErrorsTotal.WithLabelValues(category).Inc() + return code, resp } func (s *Server) resolveSandboxTarget(c *gin.Context, sandbox *types.SandboxInfo, path string) (*url.URL, bool) { diff --git a/pkg/router/metrics.go b/pkg/router/metrics.go new file mode 100644 index 00000000..bf129a6c --- /dev/null +++ b/pkg/router/metrics.go @@ -0,0 +1,61 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package router + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + routerRequestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "agentcube_router_request_duration_seconds", + Help: "Duration of proxy requests through the router in seconds.", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + []string{"kind"}, + ) + + routerProxyErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "agentcube_router_proxy_errors_total", + Help: "Total number of proxy errors encountered by the router.", + }, + []string{"error_category"}, + ) + + routerConcurrentRequests = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "agentcube_router_concurrent_requests", + Help: "Current number of concurrent requests being processed by the router.", + }, + ) + + routerSessionCreateTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "agentcube_router_session_create_total", + Help: "Total number of sandbox sessions implicitly created via the router.", + }, + ) +) + +func init() { + prometheus.MustRegister(routerRequestDuration) + prometheus.MustRegister(routerProxyErrorsTotal) + prometheus.MustRegister(routerConcurrentRequests) + prometheus.MustRegister(routerSessionCreateTotal) +} diff --git a/pkg/router/metrics_test.go b/pkg/router/metrics_test.go new file mode 100644 index 00000000..e359eb5d --- /dev/null +++ b/pkg/router/metrics_test.go @@ -0,0 +1,63 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package router + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" +) + +func TestRouterMetricsRegistrationAndCollection(t *testing.T) { + // Reset/initialize counters to know starting points (metrics are global) + initialSessionCreates := testutil.ToFloat64(routerSessionCreateTotal) + routerSessionCreateTotal.Inc() + assert.Equal(t, initialSessionCreates+1, testutil.ToFloat64(routerSessionCreateTotal)) + + initialProxyErrors := testutil.ToFloat64(routerProxyErrorsTotal.WithLabelValues("connection_refused")) + routerProxyErrorsTotal.WithLabelValues("connection_refused").Inc() + assert.Equal(t, initialProxyErrors+1, testutil.ToFloat64(routerProxyErrorsTotal.WithLabelValues("connection_refused"))) + + // Gauge test + routerConcurrentRequests.Set(5.0) + assert.Equal(t, float64(5), testutil.ToFloat64(routerConcurrentRequests)) + routerConcurrentRequests.Set(0.0) + + // Histogram test + routerRequestDuration.WithLabelValues("AgentRuntime").Observe(0.12) +} + +func TestRouterMetricsRoute(t *testing.T) { + gin.SetMode(gin.TestMode) + server := &Server{config: &Config{MaxConcurrentRequests: 10}} + server.setupRoutes() + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodGet, "/metrics", nil) + server.engine.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + body := w.Body.String() + assert.True(t, strings.Contains(body, "agentcube_router_session_create_total")) + assert.True(t, strings.Contains(body, "agentcube_router_proxy_errors_total")) + assert.True(t, strings.Contains(body, "agentcube_router_concurrent_requests")) +} diff --git a/pkg/router/server.go b/pkg/router/server.go index 5bd19472..d97dde2e 100644 --- a/pkg/router/server.go +++ b/pkg/router/server.go @@ -23,6 +23,7 @@ import ( "time" "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/promhttp" "k8s.io/klog/v2" "github.com/volcano-sh/agentcube/pkg/store" @@ -111,10 +112,12 @@ func (s *Server) concurrencyLimitMiddleware() gin.HandlerFunc { // Try to acquire a slot in the semaphore select { case concurrency <- struct{}{}: + routerConcurrentRequests.Inc() // Successfully acquired a slot, continue processing defer func() { // Release the slot when done <-concurrency + routerConcurrentRequests.Dec() }() c.Next() default: @@ -135,6 +138,7 @@ func (s *Server) setupRoutes() { // Health check endpoints (no authentication required, no concurrency limit) s.engine.GET("/health/live", s.handleHealthLive) s.engine.GET("/health/ready", s.handleHealthReady) + s.engine.GET("/metrics", gin.WrapH(promhttp.Handler())) // API v1 routes with concurrency limiting v1 := s.engine.Group("/v1") diff --git a/pkg/router/session_manager.go b/pkg/router/session_manager.go index 2644c210..569d1168 100644 --- a/pkg/router/session_manager.go +++ b/pkg/router/session_manager.go @@ -183,6 +183,8 @@ func (m *manager) createSandbox(ctx context.Context, namespace string, name stri return nil, api.NewInternalError(fmt.Errorf("response with empty session id from workload manager")) } + routerSessionCreateTotal.Inc() + // Construct Sandbox Info from response sandbox := &types.SandboxInfo{ Kind: res.Kind, diff --git a/pkg/workloadmanager/garbage_collection.go b/pkg/workloadmanager/garbage_collection.go index 0de41d18..7153e5c4 100644 --- a/pkg/workloadmanager/garbage_collection.go +++ b/pkg/workloadmanager/garbage_collection.go @@ -74,6 +74,11 @@ func (gc *garbageCollector) run(stopCh <-chan struct{}) { } func (gc *garbageCollector) once() { + start := time.Now() + defer func() { + gcCycleDuration.Observe(time.Since(start).Seconds()) + }() + ctx, cancel := context.WithTimeout(context.Background(), gcOnceTimeout) defer cancel() now := time.Now() @@ -84,11 +89,13 @@ func (gc *garbageCollector) once() { candidates, err := gc.storeClient.ListInactiveSandboxes(ctx, now.Add(-gcMinInactiveLookback), gcCandidateLimit) if err != nil { klog.Errorf("garbage collector error listing inactive sandboxes: %v", err) + gcErrorsTotal.Inc() } // Apply per-sandbox idle timeout: only include sandboxes whose own IdleTimeout // (stored in the session JSON) has actually elapsed since LastActivityAt. inactiveSandboxes := make([]*types.SandboxInfo, 0, len(candidates)) + reclaimReason := make(map[string]string) for _, s := range candidates { activityAt := s.LastActivityAt if activityAt.IsZero() { @@ -101,6 +108,7 @@ func (gc *garbageCollector) once() { } if activityAt.Add(idleTimeout).Before(now) { inactiveSandboxes = append(inactiveSandboxes, s) + reclaimReason[s.SessionID] = "inactive" } } @@ -108,7 +116,12 @@ func (gc *garbageCollector) once() { expiredSandboxes, err := gc.storeClient.ListExpiredSandboxes(ctx, now, gcCandidateLimit) if err != nil { klog.Errorf("garbage collector error listing expired sandboxes: %v", err) + gcErrorsTotal.Inc() + } + for _, s := range expiredSandboxes { + reclaimReason[s.SessionID] = "expired" } + // Merge and deduplicate: a sandbox may appear in both lists when it is // simultaneously idle-timed-out and past its TTL. gcSandboxes := deduplicateSandboxes(inactiveSandboxes, expiredSandboxes) @@ -127,12 +140,20 @@ func (gc *garbageCollector) once() { } if err != nil { errs = append(errs, err) + gcErrorsTotal.Inc() continue } klog.Infof("garbage collector %s %s/%s session %s deleted", gcSandbox.Kind, gcSandbox.SandboxNamespace, gcSandbox.Name, gcSandbox.SessionID) err = gc.storeClient.DeleteSandboxBySessionID(ctx, gcSandbox.SessionID) if err != nil { errs = append(errs, err) + gcErrorsTotal.Inc() + } else { + reason := reclaimReason[gcSandbox.SessionID] + if reason == "" { + reason = "inactive" + } + gcSandboxesReclaimedTotal.WithLabelValues(reason).Inc() } } err = utilerrors.NewAggregate(errs) diff --git a/pkg/workloadmanager/handlers.go b/pkg/workloadmanager/handlers.go index 7d417c6e..313e1a06 100644 --- a/pkg/workloadmanager/handlers.go +++ b/pkg/workloadmanager/handlers.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "net/http" + "strings" "time" "github.com/gin-gonic/gin" @@ -141,7 +142,7 @@ func (s *Server) handleSandboxCreate(c *gin.Context, kind string) { // Ensure cleanup is called when function returns to prevent memory leak defer s.sandboxController.UnWatchSandbox(namespace, sandboxName) - response, err := s.createSandbox(c.Request.Context(), dynamicClient, sandbox, sandboxClaim, sandboxEntry, resultChan) + response, err := s.createSandbox(c.Request.Context(), dynamicClient, sandbox, sandboxClaim, sandboxEntry, resultChan, sandboxReq.Kind) if err != nil { // Client disconnected — abort with 499 so logs/metrics reflect the cancellation. if errors.Is(err, context.Canceled) { @@ -196,7 +197,17 @@ func (s *Server) createK8sResources(ctx context.Context, dynamicClient dynamic.I } // createSandbox performs sandbox creation and returns the response payload or an error with an HTTP status code. -func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interface, sandbox *sandboxv1alpha1.Sandbox, sandboxClaim *extensionsv1alpha1.SandboxClaim, sandboxEntry *sandboxEntry, resultChan <-chan SandboxStatusUpdate) (*types.CreateSandboxResponse, error) { +func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interface, sandbox *sandboxv1alpha1.Sandbox, sandboxClaim *extensionsv1alpha1.SandboxClaim, sandboxEntry *sandboxEntry, resultChan <-chan SandboxStatusUpdate, kind string) (res *types.CreateSandboxResponse, err error) { + start := time.Now() + defer func() { + status := "success" + if err != nil { + status = "failure" + } + sandboxCreateDuration.WithLabelValues(kind, status).Observe(time.Since(start).Seconds()) + sandboxCreateTotal.WithLabelValues(kind, status).Inc() + }() + placeholder := buildSandboxPlaceHolder(sandbox, sandboxEntry) if err := s.storeClient.StoreSandbox(ctx, placeholder); err != nil { if isContextError(err) { @@ -212,6 +223,7 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf if !needRollbackSandbox { return } + sandboxRollbackTotal.Inc() s.rollbackSandboxCreation(dynamicClient, sandbox, sandboxClaim, sandboxEntry.SessionID) }() @@ -219,23 +231,9 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf return nil, err } - // Use NewTimer so we can stop it explicitly when another branch wins, - // preventing the runtime from retaining the timer until it fires. - timer := time.NewTimer(2 * time.Minute) // consistent with router settings - - var createdSandbox *sandboxv1alpha1.Sandbox - select { - case result := <-resultChan: - timer.Stop() - createdSandbox = result.Sandbox - klog.V(2).Infof("sandbox %s/%s reported ready, verifying entrypoints", createdSandbox.Namespace, createdSandbox.Name) - case <-ctx.Done(): - timer.Stop() - klog.Warningf("sandbox %s/%s wait canceled: %v", sandbox.Namespace, sandbox.Name, ctx.Err()) - return nil, ctx.Err() - case <-timer.C: - klog.Warningf("sandbox %s/%s create timed out", sandbox.Namespace, sandbox.Name) - return nil, errSandboxCreationTimeout + createdSandbox, err := s.waitForSandboxReady(ctx, sandbox.Name, sandbox.Namespace, resultChan) + if err != nil { + return nil, err } // agent-sandbox create pod with same name as sandbox if no warmpool is used @@ -284,6 +282,23 @@ func (s *Server) createSandbox(ctx context.Context, dynamicClient dynamic.Interf return response, nil } +func (s *Server) waitForSandboxReady(ctx context.Context, sandboxName, sandboxNamespace string, resultChan <-chan SandboxStatusUpdate) (*sandboxv1alpha1.Sandbox, error) { + timer := time.NewTimer(2 * time.Minute) + defer timer.Stop() + + select { + case result := <-resultChan: + klog.V(2).Infof("sandbox %s/%s reported ready, verifying entrypoints", result.Sandbox.Namespace, result.Sandbox.Name) + return result.Sandbox, nil + case <-ctx.Done(): + klog.Warningf("sandbox %s/%s wait canceled: %v", sandboxNamespace, sandboxName, ctx.Err()) + return nil, ctx.Err() + case <-timer.C: + klog.Warningf("sandbox %s/%s create timed out", sandboxNamespace, sandboxName) + return nil, errSandboxCreationTimeout + } +} + // rollbackSandboxCreation deletes the sandbox (or sandbox claim) and its store // placeholder when creation fails. It runs in a fresh context so that a // canceled request context does not prevent cleanup. @@ -310,6 +325,11 @@ func (s *Server) rollbackSandboxCreation(dynamicClient dynamic.Interface, sandbo // handleDeleteSandbox handles sandbox deletion requests func (s *Server) handleDeleteSandbox(c *gin.Context) { + kind := types.AgentRuntimeKind + if strings.Contains(c.Request.URL.Path, "code-interpreter") { + kind = types.CodeInterpreterKind + } + sessionID := c.Param("sessionId") // Query sandbox from store sandbox, err := s.storeClient.GetSandboxBySessionID(c.Request.Context(), sessionID) @@ -372,6 +392,8 @@ func (s *Server) handleDeleteSandbox(c *gin.Context) { return } + sandboxDeleteTotal.WithLabelValues(kind).Inc() + klog.Infof("delete %s %s/%s successfully, sessionID: %v ", sandbox.Kind, sandbox.SandboxNamespace, sandbox.Name, sandbox.SessionID) respondJSON(c, http.StatusOK, map[string]string{ "message": "Sandbox deleted successfully", diff --git a/pkg/workloadmanager/handlers_test.go b/pkg/workloadmanager/handlers_test.go index 5a31d7c0..2addbc67 100644 --- a/pkg/workloadmanager/handlers_test.go +++ b/pkg/workloadmanager/handlers_test.go @@ -242,7 +242,7 @@ func TestServerCreateSandbox(t *testing.T) { claim = &extensionsv1alpha1.SandboxClaim{ObjectMeta: metav1.ObjectMeta{Name: sb.Name, Namespace: sb.Namespace}} } - resp, err := server.createSandbox(context.Background(), nil, sb, claim, makeEntry(), resultChan) + resp, err := server.createSandbox(context.Background(), nil, sb, claim, makeEntry(), resultChan, "AgentRuntime") require.Equal(t, tt.expectCreateCalls, createCalls, "createSandbox call count") require.Equal(t, tt.expectClaimCalls, claimCalls, "createSandboxClaim call count") @@ -442,7 +442,7 @@ func TestHandleSandboxCreate(t *testing.T) { }) createCalls := 0 - patches.ApplyPrivateMethod(reflect.TypeOf(fakeServer), "createSandbox", func(_ *Server, _ context.Context, _ dynamic.Interface, _ *sandboxv1alpha1.Sandbox, _ *extensionsv1alpha1.SandboxClaim, _ *sandboxEntry, _ <-chan SandboxStatusUpdate) (*types.CreateSandboxResponse, error) { + patches.ApplyPrivateMethod(reflect.TypeOf(fakeServer), "createSandbox", func(_ *Server, _ context.Context, _ dynamic.Interface, _ *sandboxv1alpha1.Sandbox, _ *extensionsv1alpha1.SandboxClaim, _ *sandboxEntry, _ <-chan SandboxStatusUpdate, _ string) (*types.CreateSandboxResponse, error) { createCalls++ if tc.createErr != nil { return nil, tc.createErr diff --git a/pkg/workloadmanager/metrics.go b/pkg/workloadmanager/metrics.go new file mode 100644 index 00000000..6cc47bf7 --- /dev/null +++ b/pkg/workloadmanager/metrics.go @@ -0,0 +1,88 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloadmanager + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + sandboxCreateDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "agentcube_sandbox_create_duration_seconds", + Help: "Duration of sandbox creation in seconds.", + Buckets: []float64{1, 2, 5, 10, 20, 30, 45, 60, 90, 120, 150, 180}, + }, + []string{"kind", "status"}, + ) + + sandboxCreateTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "agentcube_sandbox_create_total", + Help: "Total number of sandboxes created.", + }, + []string{"kind", "status"}, + ) + + sandboxDeleteTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "agentcube_sandbox_delete_total", + Help: "Total number of sandboxes deleted.", + }, + []string{"kind"}, + ) + + sandboxRollbackTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "agentcube_sandbox_rollback_total", + Help: "Total number of sandbox creation rollbacks.", + }, + ) + + gcCycleDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "agentcube_gc_cycle_duration_seconds", + Help: "Duration of a GC cycle in seconds.", + Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10, 30, 60}, + }, + ) + + gcSandboxesReclaimedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "agentcube_gc_sandboxes_reclaimed_total", + Help: "Total number of sandboxes reclaimed by the garbage collector.", + }, + []string{"reason"}, + ) + + gcErrorsTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "agentcube_gc_errors_total", + Help: "Total number of garbage collection errors.", + }, + ) +) + +func init() { + prometheus.MustRegister(sandboxCreateDuration) + prometheus.MustRegister(sandboxCreateTotal) + prometheus.MustRegister(sandboxDeleteTotal) + prometheus.MustRegister(sandboxRollbackTotal) + prometheus.MustRegister(gcCycleDuration) + prometheus.MustRegister(gcSandboxesReclaimedTotal) + prometheus.MustRegister(gcErrorsTotal) +} diff --git a/pkg/workloadmanager/metrics_test.go b/pkg/workloadmanager/metrics_test.go new file mode 100644 index 00000000..c7f3b5dd --- /dev/null +++ b/pkg/workloadmanager/metrics_test.go @@ -0,0 +1,73 @@ +/* +Copyright The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package workloadmanager + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" +) + +func TestMetricsRegistrationAndCollection(t *testing.T) { + // Reset/initialize counters to know starting points (metrics are global) + initialRollbacks := testutil.ToFloat64(sandboxRollbackTotal) + sandboxRollbackTotal.Inc() + assert.Equal(t, initialRollbacks+1, testutil.ToFloat64(sandboxRollbackTotal)) + + initialCreates := testutil.ToFloat64(sandboxCreateTotal.WithLabelValues("AgentRuntime", "success")) + sandboxCreateTotal.WithLabelValues("AgentRuntime", "success").Inc() + assert.Equal(t, initialCreates+1, testutil.ToFloat64(sandboxCreateTotal.WithLabelValues("AgentRuntime", "success"))) + + initialDeletes := testutil.ToFloat64(sandboxDeleteTotal.WithLabelValues("CodeInterpreter")) + sandboxDeleteTotal.WithLabelValues("CodeInterpreter").Inc() + assert.Equal(t, initialDeletes+1, testutil.ToFloat64(sandboxDeleteTotal.WithLabelValues("CodeInterpreter"))) + + // Histogram test + sandboxCreateDuration.WithLabelValues("AgentRuntime", "success").Observe(1.5) + + // Garbage Collection metrics + gcCycleDuration.Observe(0.5) + + initialReclaimed := testutil.ToFloat64(gcSandboxesReclaimedTotal.WithLabelValues("expired")) + gcSandboxesReclaimedTotal.WithLabelValues("expired").Inc() + assert.Equal(t, initialReclaimed+1, testutil.ToFloat64(gcSandboxesReclaimedTotal.WithLabelValues("expired"))) + + initialGcErrors := testutil.ToFloat64(gcErrorsTotal) + gcErrorsTotal.Inc() + assert.Equal(t, initialGcErrors+1, testutil.ToFloat64(gcErrorsTotal)) +} + +func TestMetricsRoute(t *testing.T) { + gin.SetMode(gin.TestMode) + server := &Server{} + server.setupRoutes() + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodGet, "/metrics", nil) + server.router.ServeHTTP(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + body := w.Body.String() + assert.True(t, strings.Contains(body, "agentcube_sandbox_rollback_total")) + assert.True(t, strings.Contains(body, "agentcube_sandbox_create_total")) + assert.True(t, strings.Contains(body, "agentcube_gc_errors_total")) +} diff --git a/pkg/workloadmanager/server.go b/pkg/workloadmanager/server.go index 53b10dfb..1c78db7c 100644 --- a/pkg/workloadmanager/server.go +++ b/pkg/workloadmanager/server.go @@ -24,6 +24,7 @@ import ( "time" "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/promhttp" "k8s.io/klog/v2" "github.com/volcano-sh/agentcube/pkg/store" @@ -110,6 +111,7 @@ func (s *Server) setupRoutes() { // Health check (no authentication required) s.router.GET("/health", s.handleHealth) + s.router.GET("/metrics", gin.WrapH(promhttp.Handler())) // API v1 routes v1Group := s.router.Group("/v1") From d0f9344d1af325bc53c3aef5c6a2865d0da041c7 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Sat, 23 May 2026 14:37:47 +0530 Subject: [PATCH 2/7] fix: use sandbox.Kind for sandboxDeleteTotal label Signed-off-by: Abhinav Singh --- pkg/workloadmanager/handlers.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pkg/workloadmanager/handlers.go b/pkg/workloadmanager/handlers.go index 313e1a06..cf541e14 100644 --- a/pkg/workloadmanager/handlers.go +++ b/pkg/workloadmanager/handlers.go @@ -21,7 +21,6 @@ import ( "errors" "fmt" "net/http" - "strings" "time" "github.com/gin-gonic/gin" @@ -325,11 +324,6 @@ func (s *Server) rollbackSandboxCreation(dynamicClient dynamic.Interface, sandbo // handleDeleteSandbox handles sandbox deletion requests func (s *Server) handleDeleteSandbox(c *gin.Context) { - kind := types.AgentRuntimeKind - if strings.Contains(c.Request.URL.Path, "code-interpreter") { - kind = types.CodeInterpreterKind - } - sessionID := c.Param("sessionId") // Query sandbox from store sandbox, err := s.storeClient.GetSandboxBySessionID(c.Request.Context(), sessionID) @@ -342,6 +336,8 @@ func (s *Server) handleDeleteSandbox(c *gin.Context) { respondError(c, http.StatusInternalServerError, "internal server error") return } + + kind := sandbox.Kind dynamicClient := s.k8sClient.dynamicClient if s.config.EnableAuth { From d3bb797be849be3c6aa9d558d803594012312bf9 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Sat, 23 May 2026 14:46:07 +0530 Subject: [PATCH 3/7] fix: address review feedback on metrics tests and whitespace Signed-off-by: Abhinav Singh --- pkg/router/metrics_test.go | 1 + pkg/workloadmanager/handlers.go | 2 +- pkg/workloadmanager/metrics_test.go | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/router/metrics_test.go b/pkg/router/metrics_test.go index e359eb5d..dac1a4f4 100644 --- a/pkg/router/metrics_test.go +++ b/pkg/router/metrics_test.go @@ -50,6 +50,7 @@ func TestRouterMetricsRoute(t *testing.T) { gin.SetMode(gin.TestMode) server := &Server{config: &Config{MaxConcurrentRequests: 10}} server.setupRoutes() + routerProxyErrorsTotal.WithLabelValues("other").Add(0) w := httptest.NewRecorder() req, _ := http.NewRequest(http.MethodGet, "/metrics", nil) diff --git a/pkg/workloadmanager/handlers.go b/pkg/workloadmanager/handlers.go index cf541e14..1c1a5417 100644 --- a/pkg/workloadmanager/handlers.go +++ b/pkg/workloadmanager/handlers.go @@ -336,7 +336,7 @@ func (s *Server) handleDeleteSandbox(c *gin.Context) { respondError(c, http.StatusInternalServerError, "internal server error") return } - + kind := sandbox.Kind dynamicClient := s.k8sClient.dynamicClient diff --git a/pkg/workloadmanager/metrics_test.go b/pkg/workloadmanager/metrics_test.go index c7f3b5dd..1448081e 100644 --- a/pkg/workloadmanager/metrics_test.go +++ b/pkg/workloadmanager/metrics_test.go @@ -43,10 +43,10 @@ func TestMetricsRegistrationAndCollection(t *testing.T) { // Histogram test sandboxCreateDuration.WithLabelValues("AgentRuntime", "success").Observe(1.5) - + // Garbage Collection metrics gcCycleDuration.Observe(0.5) - + initialReclaimed := testutil.ToFloat64(gcSandboxesReclaimedTotal.WithLabelValues("expired")) gcSandboxesReclaimedTotal.WithLabelValues("expired").Inc() assert.Equal(t, initialReclaimed+1, testutil.ToFloat64(gcSandboxesReclaimedTotal.WithLabelValues("expired"))) @@ -60,6 +60,7 @@ func TestMetricsRoute(t *testing.T) { gin.SetMode(gin.TestMode) server := &Server{} server.setupRoutes() + sandboxCreateTotal.WithLabelValues("AgentRuntime", "success").Add(0) w := httptest.NewRecorder() req, _ := http.NewRequest(http.MethodGet, "/metrics", nil) From dfc1ddc3c700d8622eea9c7a7ad938b39f0712b1 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Sat, 23 May 2026 14:52:01 +0530 Subject: [PATCH 4/7] ci: retrigger coverage check Signed-off-by: Abhinav Singh From b7272f89e51d0be10f6ab9d6f615350ba28176d2 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Sat, 23 May 2026 16:30:19 +0530 Subject: [PATCH 5/7] ci: retrigger CI after runner network failure Signed-off-by: Abhinav Singh From 5c2eec35fdd9990fc4a82cd1185621ba9f858a4f Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Sat, 23 May 2026 18:01:53 +0530 Subject: [PATCH 6/7] ci: retrigger CI after credentials error Signed-off-by: Abhinav Singh From 80f325d39c8e9a64f0b505c3526c5d26f5f5b07a Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Mon, 25 May 2026 15:20:26 +0530 Subject: [PATCH 7/7] refactor: migrate metrics registration from init()/MustRegister to promauto Signed-off-by: Abhinav Singh --- pkg/router/metrics.go | 16 +++++----------- pkg/workloadmanager/metrics.go | 25 ++++++++----------------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/pkg/router/metrics.go b/pkg/router/metrics.go index bf129a6c..db1f12f1 100644 --- a/pkg/router/metrics.go +++ b/pkg/router/metrics.go @@ -18,10 +18,11 @@ package router import ( "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" ) var ( - routerRequestDuration = prometheus.NewHistogramVec( + routerRequestDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "agentcube_router_request_duration_seconds", Help: "Duration of proxy requests through the router in seconds.", @@ -30,7 +31,7 @@ var ( []string{"kind"}, ) - routerProxyErrorsTotal = prometheus.NewCounterVec( + routerProxyErrorsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "agentcube_router_proxy_errors_total", Help: "Total number of proxy errors encountered by the router.", @@ -38,24 +39,17 @@ var ( []string{"error_category"}, ) - routerConcurrentRequests = prometheus.NewGauge( + routerConcurrentRequests = promauto.NewGauge( prometheus.GaugeOpts{ Name: "agentcube_router_concurrent_requests", Help: "Current number of concurrent requests being processed by the router.", }, ) - routerSessionCreateTotal = prometheus.NewCounter( + routerSessionCreateTotal = promauto.NewCounter( prometheus.CounterOpts{ Name: "agentcube_router_session_create_total", Help: "Total number of sandbox sessions implicitly created via the router.", }, ) ) - -func init() { - prometheus.MustRegister(routerRequestDuration) - prometheus.MustRegister(routerProxyErrorsTotal) - prometheus.MustRegister(routerConcurrentRequests) - prometheus.MustRegister(routerSessionCreateTotal) -} diff --git a/pkg/workloadmanager/metrics.go b/pkg/workloadmanager/metrics.go index 6cc47bf7..d0cc8138 100644 --- a/pkg/workloadmanager/metrics.go +++ b/pkg/workloadmanager/metrics.go @@ -18,10 +18,11 @@ package workloadmanager import ( "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" ) var ( - sandboxCreateDuration = prometheus.NewHistogramVec( + sandboxCreateDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "agentcube_sandbox_create_duration_seconds", Help: "Duration of sandbox creation in seconds.", @@ -30,7 +31,7 @@ var ( []string{"kind", "status"}, ) - sandboxCreateTotal = prometheus.NewCounterVec( + sandboxCreateTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "agentcube_sandbox_create_total", Help: "Total number of sandboxes created.", @@ -38,7 +39,7 @@ var ( []string{"kind", "status"}, ) - sandboxDeleteTotal = prometheus.NewCounterVec( + sandboxDeleteTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "agentcube_sandbox_delete_total", Help: "Total number of sandboxes deleted.", @@ -46,14 +47,14 @@ var ( []string{"kind"}, ) - sandboxRollbackTotal = prometheus.NewCounter( + sandboxRollbackTotal = promauto.NewCounter( prometheus.CounterOpts{ Name: "agentcube_sandbox_rollback_total", Help: "Total number of sandbox creation rollbacks.", }, ) - gcCycleDuration = prometheus.NewHistogram( + gcCycleDuration = promauto.NewHistogram( prometheus.HistogramOpts{ Name: "agentcube_gc_cycle_duration_seconds", Help: "Duration of a GC cycle in seconds.", @@ -61,7 +62,7 @@ var ( }, ) - gcSandboxesReclaimedTotal = prometheus.NewCounterVec( + gcSandboxesReclaimedTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "agentcube_gc_sandboxes_reclaimed_total", Help: "Total number of sandboxes reclaimed by the garbage collector.", @@ -69,20 +70,10 @@ var ( []string{"reason"}, ) - gcErrorsTotal = prometheus.NewCounter( + gcErrorsTotal = promauto.NewCounter( prometheus.CounterOpts{ Name: "agentcube_gc_errors_total", Help: "Total number of garbage collection errors.", }, ) ) - -func init() { - prometheus.MustRegister(sandboxCreateDuration) - prometheus.MustRegister(sandboxCreateTotal) - prometheus.MustRegister(sandboxDeleteTotal) - prometheus.MustRegister(sandboxRollbackTotal) - prometheus.MustRegister(gcCycleDuration) - prometheus.MustRegister(gcSandboxesReclaimedTotal) - prometheus.MustRegister(gcErrorsTotal) -}