diff --git a/.gitignore b/.gitignore index 12b27f668..4facf47e5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ vendor/ apiserver.local.config/ /apiserver/porch *.log +*.csv +load_test_results.txt .env # Development artifact path diff --git a/cmd/porch/main.go b/cmd/porch/main.go index dff4d66d1..cac2f5dde 100644 --- a/cmd/porch/main.go +++ b/cmd/porch/main.go @@ -16,7 +16,9 @@ package main import ( "os" + "time" + "github.com/nephio-project/porch/internal/metrics" porchotel "github.com/nephio-project/porch/internal/otel" "github.com/nephio-project/porch/pkg/cmd/server" genericapiserver "k8s.io/apiserver/pkg/server" @@ -34,12 +36,26 @@ func main() { func run() int { log.SetLogger(zap.New(zap.UseDevMode(true))) ctx := genericapiserver.SetupSignalContext() - err := porchotel.SetupOpenTelemetry(ctx) + otelResources, err := porchotel.SetupOpenTelemetry(ctx) if err != nil { genericapiserver.RequestShutdown() klog.Errorf("%v\n", err) return 1 } + defer func() { + if err := otelResources.ShutdownWithTimeout(10 * time.Second); err != nil { + klog.Warningf("failed to gracefully shutdown OpenTelemetry: %v", err) + } + }() + + prof := &metrics.Profiling{} + prof.Start() + defer prof.Stop() + + pyro := &metrics.PyroscopeProfiling{} + pyro.Start() + defer pyro.Stop() + options := server.NewPorchServerOptions(os.Stdout, os.Stderr) cmd := server.NewCommandStartPorchServer(ctx, options) code := cli.Run(cmd) diff --git a/controllers/main.go b/controllers/main.go index 24ed14c21..2baf7dffd 100644 --- a/controllers/main.go +++ b/controllers/main.go @@ -25,7 +25,9 @@ import ( "net/http" "os" "strings" + "time" + "github.com/nephio-project/porch/internal/metrics" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. 
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" @@ -122,6 +124,7 @@ func run(ctx context.Context) error { if err != nil { return err } + metrics.InitMetrics() mgr, err := newManager(ctx, scheme) if err != nil { @@ -203,9 +206,15 @@ func newManager(ctx context.Context, scheme *runtime.Scheme) (ctrl.Manager, erro } otel.SetLogger(klog.NewKlogr()) - if err := porchotel.SetupOpenTelemetry(ctx); err != nil { + otelResources, err := porchotel.SetupOpenTelemetry(ctx) + if err != nil { return nil, fmt.Errorf("error setting up OpenTelemetry: %w", err) } + defer func() { + if shutdownErr := otelResources.ShutdownWithTimeout(10 * time.Second); shutdownErr != nil { + klog.Warningf("failed to gracefully shutdown OpenTelemetry: %v", shutdownErr) + } + }() mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme, @@ -323,7 +332,6 @@ func setupFunctionConfigReconciler(mgr ctrl.Manager) (*reconciler.FunctionConfig return functionConfigStore, nil } - // --- Helpers --- func buildReconcilerMap(reconcilers ...Reconciler) map[string]Reconciler { diff --git a/controllers/packagerevisions/pkg/controllers/packagerevision/render.go b/controllers/packagerevisions/pkg/controllers/packagerevision/render.go index 6310ae0f9..a64e83249 100644 --- a/controllers/packagerevisions/pkg/controllers/packagerevision/render.go +++ b/controllers/packagerevisions/pkg/controllers/packagerevision/render.go @@ -150,7 +150,6 @@ func renderTrigger(pr *porchv1alpha2.PackageRevision) (requested string, annotat return } - // isRenderStale returns true if the annotation changed during render. 
func isRenderStale(currentAnnotation, rendered string) bool { return currentAnnotation != rendered diff --git a/controllers/repositories/pkg/controllers/repository/repository_controller.go b/controllers/repositories/pkg/controllers/repository/repository_controller.go index 0f402dec9..b5a0db357 100644 --- a/controllers/repositories/pkg/controllers/repository/repository_controller.go +++ b/controllers/repositories/pkg/controllers/repository/repository_controller.go @@ -67,7 +67,7 @@ type RepositoryReconciler struct { // Private implementation details syncLimiter chan struct{} // Semaphore for sync concurrency - coldStartRepos sync.Map // Tracks repos that have synced since startup + coldStartRepos sync.Map // Tracks repos that have synced since startup } //go:generate go run sigs.k8s.io/controller-tools/cmd/controller-gen@v0.19.0 rbac:headerFile=../../../../../scripts/boilerplate.yaml.txt,roleName=porch-controllers-repositories,year=$YEAR_GEN webhook paths="." output:rbac:artifacts:config=../../../config/rbac diff --git a/deployments/metrics-resources/grafana-perf-test-dashboard.json b/deployments/metrics-resources/grafana-perf-test-dashboard.json new file mode 100644 index 000000000..c986ce3ff --- /dev/null +++ b/deployments/metrics-resources/grafana-perf-test-dashboard.json @@ -0,0 +1,2358 @@ +{ + "title": "Porch Performance Test Dashboard", + "tags": [ + "porch", + "performance", + "metrics" + ], + "timezone": "browser", + "schemaVersion": 16, + "version": 0, + "refresh": "5s", + "time": { + "from": "now-5m", + "to": "now" + }, + "panels": [ + { + "id": 1, + "title": "GITEA-REPO-CREATE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GITEA-REPO-CREATE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"GITEA-REPO-CREATE\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"GITEA-REPO-CREATE\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GITEA-REPO-CREATE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "percentunit", + "label": "Success Rate", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 2, + "title": "GITEA-REPO-CREATE - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GITEA-REPO-CREATE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GITEA-REPO-CREATE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"GITEA-REPO-CREATE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": 
"A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 3, + "title": "PORCH-REPO-CREATE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 6 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"PORCH-REPO-CREATE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"PORCH-REPO-CREATE\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"PORCH-REPO-CREATE\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"PORCH-REPO-CREATE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 4, + "title": "PORCH-REPO-CREATE - Total & Success Rate", 
+ "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 6 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"PORCH-REPO-CREATE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"PORCH-REPO-CREATE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"PORCH-REPO-CREATE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 5, + "title": "REPO-WAIT - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"REPO-WAIT\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"REPO-WAIT\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"REPO-WAIT\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"REPO-WAIT\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + 
{ + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 6, + "title": "REPO-WAIT - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"REPO-WAIT\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"REPO-WAIT\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"REPO-WAIT\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 7, + "title": "LIST - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"LIST\"}[5m])))", + "legendFormat": "p95 
duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"LIST\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"LIST\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"LIST\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 8, + "title": "LIST - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 18 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"LIST\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"LIST\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"LIST\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + 
"matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 9, + "title": "GET - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 24 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GET\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"GET\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"GET\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GET\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 10, + "title": "GET - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 24 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"expr": "sum(porch_perf_operations_total{operation=\"GET\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GET\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"GET\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 11, + "title": "GET-PROPOSED - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 30 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GET-PROPOSED\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"GET-PROPOSED\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"GET-PROPOSED\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GET-PROPOSED\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + 
"grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 12, + "title": "GET-PROPOSED - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 30 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GET-PROPOSED\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GET-PROPOSED\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"GET-PROPOSED\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 13, + "title": "GET-RESOURCES - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 36 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GET-RESOURCES\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": 
"sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"GET-RESOURCES\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"GET-RESOURCES\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"GET-RESOURCES\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 14, + "title": "GET-RESOURCES - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 36 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GET-RESOURCES\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"GET-RESOURCES\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"GET-RESOURCES\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" 
+ }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 15, + "title": "CREATE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 42 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"CREATE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"CREATE\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"CREATE\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"CREATE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 16, + "title": "CREATE - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 42 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": 
"sum(porch_perf_operations_total{operation=\"CREATE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"CREATE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"CREATE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 19, + "title": "UPDATE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 54 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"UPDATE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"UPDATE\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"UPDATE\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"UPDATE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 20, + "title": "UPDATE - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 54 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"UPDATE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"UPDATE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"UPDATE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 23, + "title": "PROPOSE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 66 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"PROPOSE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"PROPOSE\"}[5m])) / 
sum(rate(porch_perf_operation_duration_seconds_count{operation=\"PROPOSE\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"PROPOSE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 24, + "title": "PROPOSE - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 66 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"PROPOSE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"PROPOSE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"PROPOSE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" 
+ } + ] + } + ] + } + }, + { + "id": 25, + "title": "APPROVE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 72 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"APPROVE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"APPROVE\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"APPROVE\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"APPROVE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 26, + "title": "APPROVE - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 72 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"APPROVE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, 
+ "expr": "sum(porch_perf_operations_total{operation=\"APPROVE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"APPROVE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 27, + "title": "PROPOSE-DELETION - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 78 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"PROPOSE-DELETION\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"PROPOSE-DELETION\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"PROPOSE-DELETION\"}[5m]))", + "legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"PROPOSE-DELETION\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + 
"threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 28, + "title": "PROPOSE-DELETION - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 78 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"PROPOSE-DELETION\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"PROPOSE-DELETION\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"PROPOSE-DELETION\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 29, + "title": "DELETE - Duration & Success Rate", + "type": "graph", + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 84 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"DELETE\"}[5m])))", + "legendFormat": "p95 duration", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_operation_duration_seconds_sum{operation=\"DELETE\"}[5m])) / sum(rate(porch_perf_operation_duration_seconds_count{operation=\"DELETE\"}[5m]))", + 
"legendFormat": "avg duration", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum by (le) (rate(porch_perf_operation_duration_seconds_bucket{operation=\"DELETE\"}[5m])))", + "legendFormat": "p99 duration", + "refId": "C" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 30, + "title": "DELETE - Total & Success Rate", + "type": "stat", + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 84 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"DELETE\"})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{operation=\"DELETE\",status=\"success\"}) / sum(porch_perf_operations_total{operation=\"DELETE\"})", + "refId": "B" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "displayName", + "value": "Success Rate" + }, + { + "id": "unit", + "value": "percentunit" + } + ] + } + ] + } + }, + { + "id": 31, + "title": "Overall Success Rate by Operation", + 
"type": "graph", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 90 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total{status=\"success\"}) by (operation) / sum(porch_perf_operations_total) by (operation)", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "percentunit", + "label": "Success Rate", + "min": 0, + "max": 1, + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 32, + "title": "All Operations - Avg Duration Comparison", + "type": "graph", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 98 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (operation) (rate(porch_perf_operation_duration_seconds_sum[5m])) / sum by (operation) (rate(porch_perf_operation_duration_seconds_count[5m]))", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Avg Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, 
+ "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 33, + "title": "All Operations - Total Count", + "type": "graph", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 98 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_operations_total) by (operation)", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "short", + "label": "Total Operations", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 34, + "title": "Lifecycle Transition Duration", + "type": "graph", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 106 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(porch_perf_lifecycle_transition_duration_seconds_bucket{from_state=\"Draft\", to_state=\"Proposed\"}[5m])) by (le))", + "legendFormat": "Draft -> Proposed (p99)", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_lifecycle_transition_duration_seconds_sum{from_state=\"Draft\", to_state=\"Proposed\"}[5m])) / sum(rate(porch_perf_lifecycle_transition_duration_seconds_count{from_state=\"Draft\", to_state=\"Proposed\"}[5m]))", + "legendFormat": "Draft -> Proposed (avg)", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, 
sum(rate(porch_perf_lifecycle_transition_duration_seconds_bucket{from_state=\"Proposed\", to_state=\"Published\"}[5m])) by (le))", + "legendFormat": "Proposed -> Published (p99)", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_lifecycle_transition_duration_seconds_sum{from_state=\"Proposed\", to_state=\"Published\"}[5m])) / sum(rate(porch_perf_lifecycle_transition_duration_seconds_count{from_state=\"Proposed\", to_state=\"Published\"}[5m]))", + "legendFormat": "Proposed -> Published (avg)", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(porch_perf_lifecycle_transition_duration_seconds_bucket{from_state=\"Published\", to_state=\"DeletionProposed\"}[5m])) by (le))", + "legendFormat": "Published -> DeletionProposed (p99)", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_lifecycle_transition_duration_seconds_sum{from_state=\"Published\", to_state=\"DeletionProposed\"}[5m])) / sum(rate(porch_perf_lifecycle_transition_duration_seconds_count{from_state=\"Published\", to_state=\"DeletionProposed\"}[5m]))", + "legendFormat": "Published -> DeletionProposed (avg)", + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(porch_perf_lifecycle_transition_duration_seconds_bucket{from_state=\"Proposed\", to_state=\"DeletionProposed\"}[5m])) by (le))", + "legendFormat": "Proposed -> DeletionProposed (p99)", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_lifecycle_transition_duration_seconds_sum{from_state=\"Proposed\", to_state=\"DeletionProposed\"}[5m])) / sum(rate(porch_perf_lifecycle_transition_duration_seconds_count{from_state=\"Proposed\", to_state=\"DeletionProposed\"}[5m]))", + 
"legendFormat": "Proposed -> DeletionProposed (avg)", + "refId": "I" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(porch_perf_lifecycle_transition_duration_seconds_bucket{from_state=\"DeletionProposed\", to_state=\"deleted\"}[5m])) by (le))", + "legendFormat": "DeletionProposed -> deleted (p99)", + "refId": "J" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(porch_perf_lifecycle_transition_duration_seconds_sum{from_state=\"DeletionProposed\", to_state=\"deleted\"}[5m])) / sum(rate(porch_perf_lifecycle_transition_duration_seconds_count{from_state=\"DeletionProposed\", to_state=\"deleted\"}[5m]))", + "legendFormat": "DeletionProposed -> deleted (avg)", + "refId": "K" + } + ], + "yaxes": [ + { + "format": "s", + "label": "Duration", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 35, + "title": "Active Operations", + "type": "graph", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 106 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_active_operations) by (operation)", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "short", + "label": "Active Count", + "show": true + }, + { + "format": "short", + "show": true + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [] + }, + "grid": { + "show": true, + 
"threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "lines": true, + "linewidth": 1, + "pointradius": 5, + "points": false + }, + { + "id": 36, + "title": "Total Repositories Created", + "type": "stat", + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 114 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "porch_perf_repositories_created_total", + "refId": "A" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "defaults": { + "displayName": "Repositories" + } + } + }, + { + "id": 37, + "title": "Total Packages Created", + "type": "stat", + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 114 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "porch_perf_packages_created_total", + "refId": "A" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + }, + "fieldConfig": { + "defaults": { + "displayName": "Packages" + } + } + }, + { + "id": 38, + "title": "Package Revisions by Status", + "type": "stat", + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 114 + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(porch_perf_package_revisions_total) by (operation, status)", + "legendFormat": "{{operation}} - {{status}}", + "refId": "A" + } + ], + "options": { + "graphMode": "area", + "textMode": "value_and_name" + } + } + ], + "templating": { + "list": [ + { + "name": "namespace", + "type": "query", + "query": "label_values(porch_perf_test_run_info, namespace)", + "current": { + "text": "porch-metrics", + "value": "porch-metrics" + }, + "refresh": 1 + } + ] + }, + "timepicker": { + "refresh_intervals": [ + 
"1s", + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + } +} \ No newline at end of file diff --git a/deployments/metrics-resources/grafana-porch-server-dashboard.json b/deployments/metrics-resources/grafana-porch-server-dashboard.json new file mode 100644 index 000000000..59b9c5372 --- /dev/null +++ b/deployments/metrics-resources/grafana-porch-server-dashboard.json @@ -0,0 +1,500 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "PackageRevisions API Calls", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions GET latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, + "id": 101, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"GET\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, 
sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"GET\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevision\", verb=\"GET\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevision\", verb=\"GET\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PR GET", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions LIST latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, + "id": 102, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"LIST\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"LIST\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": 
"sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevision\", verb=\"LIST\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevision\", verb=\"LIST\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PR LIST", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions CREATE latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, + "id": 103, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"CREATE\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"CREATE\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevision\", verb=\"CREATE\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevision\", verb=\"CREATE\"}[5m]))", + "legendFormat": "avg", + 
"refId": "C" + } + ], + "title": "PR CREATE", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions UPDATE latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, + "id": 104, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"UPDATE\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"UPDATE\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevision\", verb=\"UPDATE\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevision\", verb=\"UPDATE\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PR UPDATE", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions DELETE latency metrics (p95, p99, average).", + 
"fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, + "id": 105, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"DELETE\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevision\", verb=\"DELETE\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevision\", verb=\"DELETE\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevision\", verb=\"DELETE\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PR DELETE", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + "id": 200, + "panels": [], + "title": "PackageRevisionResources API Calls", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisionResources GET latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { 
"mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, + "id": 201, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionResources\", verb=\"GET\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionResources\", verb=\"GET\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevisionResources\", verb=\"GET\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevisionResources\", verb=\"GET\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PRR GET", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisionResources UPDATE latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": 
"linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 }, + "id": 202, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionResources\", verb=\"UPDATE\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionResources\", verb=\"UPDATE\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevisionResources\", verb=\"UPDATE\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevisionResources\", verb=\"UPDATE\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PRR UPDATE", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisionResources LIST latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, 
"thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }, + "id": 203, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionResources\", verb=\"LIST\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionResources\", verb=\"LIST\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevisionResources\", verb=\"LIST\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevisionResources\", verb=\"LIST\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "PRR LIST", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 42 }, + "id": 300, + "panels": [], + "title": "Approval API Calls", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions Approval GET latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" 
} }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 43 }, + "id": 301, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionApproval\", verb=\"GET\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionApproval\", verb=\"GET\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevisionApproval\", verb=\"GET\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevisionApproval\", verb=\"GET\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "Approval GET", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "PackageRevisions Approval UPDATE latency metrics (p95, p99, average).", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 43 }, + "id": 302, + "options": { "legend": { "displayMode": "list", "placement": "bottom", 
"showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionApproval\", verb=\"UPDATE\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(porch_api_call_duration_seconds_bucket{resource=\"PackageRevisionApproval\", verb=\"UPDATE\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_call_duration_seconds_sum{resource=\"PackageRevisionApproval\", verb=\"UPDATE\"}[5m])) / sum(rate(porch_api_call_duration_seconds_count{resource=\"PackageRevisionApproval\", verb=\"UPDATE\"}[5m]))", + "legendFormat": "avg", + "refId": "C" + } + ], + "title": "Approval UPDATE", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 51 }, + "id": 500, + "panels": [], + "title": "Request Counts by User", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Rate of PackageRevision requests per operation and user.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 52 }, + "id": 501, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, 
"tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_requests_by_user_total{resource=\"PackageRevision\"}[5m])) by (op, user)", + "legendFormat": "{{op}} / {{user}}", + "refId": "A" + } + ], + "title": "PackageRevision Requests/s by Operation & User", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Rate of ExternalRepo requests per operation and user.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 52 }, + "id": 502, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(porch_api_requests_by_user_total{resource=\"ExternalRepo\"}[5m])) by (op, user)", + "legendFormat": "{{op}} / {{user}}", + "refId": "A" + } + ], + "title": "ExternalRepo Requests/s by Operation & User", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Total cumulative request counts broken down by resource, operation, and user.", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisCenteredZero": false, "axisLabel": "", "axisPlacement": "auto", "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "lineInterpolation": "linear", 
"lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 60 }, + "id": 503, + "options": { "legend": { "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(porch_api_requests_by_user_total) by (resource, op, user)", + "legendFormat": "{{resource}} / {{op}} / {{user}}", + "refId": "A" + } + ], + "title": "All Resources — Total Request Count by Resource, Operation & User", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["porch", "api"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Porch API", + "uid": "porch-api-queues", + "version": 1, + "weekStart": "" +} diff --git a/deployments/metrics-resources/grafana-pyroscope-dashboard.json b/deployments/metrics-resources/grafana-pyroscope-dashboard.json new file mode 100644 index 000000000..554951438 --- /dev/null +++ b/deployments/metrics-resources/grafana-pyroscope-dashboard.json @@ -0,0 +1,114 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "pyroscope" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 18, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + 
"options": {}, + "pluginVersion": "12.3.1", + "targets": [ + { + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "pyroscope" + }, + "groupBy": [], + "labelSelector": "{service_name=\"$service\"}", + "profileTypeId": "$profiletype", + "queryType": "profile", + "refId": "A", + "spanSelector": [] + } + ], + "title": "Porch – $service · $profiletype", + "type": "flamegraph" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 42, + "tags": ["porch", "pyroscope", "profiling"], + "templating": { + "list": [ + { + "allowCustomValue": false, + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "pyroscope" + }, + "definition": "", + "name": "profiletype", + "options": [], + "query": { + "type": "profileType" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allowCustomValue": false, + "datasource": { + "type": "grafana-pyroscope-datasource", + "uid": "pyroscope" + }, + "definition": "", + "name": "service", + "options": [], + "query": { + "labelName": "service_name", + "profileTypeId": "$profiletype", + "type": "labelValue" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Pyroscope – Porch profiling", + "uid": "pyroscope-porch", + "version": 1, + "weekStart": "" +} diff --git a/deployments/metrics-resources/grafana-resource-usage-dashboard.json b/deployments/metrics-resources/grafana-resource-usage-dashboard.json new file mode 100644 index 000000000..8597c2956 --- /dev/null +++ b/deployments/metrics-resources/grafana-resource-usage-dashboard.json @@ -0,0 +1,912 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "CPU 
Usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "CPU usage rate for porch-server container", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "CPU cores", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "short", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(process_cpu_seconds_total{job=\"porch-server\"}[5m])", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Porch Server - CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "CPU usage rate for function-runner container", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "CPU cores", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "short", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + 
"w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(process_cpu_seconds_total{job=\"function-runner\"}[5m])", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Function Runner - CPU Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 4, + "panels": [], + "title": "Memory Usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Memory usage for porch-server container", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "Memory", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "bytes", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "process_resident_memory_bytes{job=\"porch-server\"}", + "legendFormat": "resident - {{instance}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "process_virtual_memory_bytes{job=\"porch-server\"}", + "legendFormat": "virtual - 
{{instance}}", + "refId": "B" + } + ], + "title": "Porch Server - Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Memory usage for function-runner container", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "Memory", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "bytes", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "process_resident_memory_bytes{job=\"function-runner\"}", + "legendFormat": "resident - {{instance}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "process_virtual_memory_bytes{job=\"function-runner\"}", + "legendFormat": "virtual - {{instance}}", + "refId": "B" + } + ], + "title": "Function Runner - Memory Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 7, + "panels": [], + "title": "Go Runtime Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of active goroutines", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "Goroutines", + 
"axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "short", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 8, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "go_goroutines{job=\"porch-server\"}", + "legendFormat": "porch-server ({{instance}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "go_goroutines{job=\"function-runner\"}", + "legendFormat": "function-runner ({{instance}})", + "refId": "B" + } + ], + "title": "Goroutines", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Go heap memory allocation", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "Memory", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "bytes", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 9, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "go_memstats_heap_alloc_bytes{job=\"porch-server\"}", + "legendFormat": "porch-server heap alloc ({{instance}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "go_memstats_heap_inuse_bytes{job=\"porch-server\"}", + "legendFormat": "porch-server heap inuse ({{instance}})", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "go_memstats_heap_alloc_bytes{job=\"function-runner\"}", + "legendFormat": "function-runner heap alloc ({{instance}})", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "go_memstats_heap_inuse_bytes{job=\"function-runner\"}", + "legendFormat": "function-runner heap inuse ({{instance}})", + "refId": "D" + } + ], + "title": "Go Heap Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Garbage collection duration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisLabel": "Duration", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "unit": "s", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 10, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "expr": "rate(go_gc_duration_seconds_sum{job=\"porch-server\"}[5m]) / rate(go_gc_duration_seconds_count{job=\"porch-server\"}[5m])", + "legendFormat": "porch-server avg GC ({{instance}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(go_gc_duration_seconds_sum{job=\"function-runner\"}[5m]) / rate(go_gc_duration_seconds_count{job=\"function-runner\"}[5m])", + "legendFormat": "function-runner avg GC ({{instance}})", + "refId": "B" + } + ], + "title": "GC Duration (avg)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 11, + "panels": [], + "title": "Resource Stats Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current CPU usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "decimals": 3 + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 28 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(process_cpu_seconds_total{job=\"porch-server\"}[5m])", + "refId": "A" + } + ], + "title": "Porch Server CPU (cores)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current memory usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 536870912 + }, + { + "color": "red", + "value": 1073741824 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 28 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "process_resident_memory_bytes{job=\"porch-server\"}", + "refId": "A" + } + ], + "title": "Porch Server Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current CPU usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "decimals": 3 + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 28 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(process_cpu_seconds_total{job=\"function-runner\"}[5m])", + "refId": "A" + } + ], + "title": "Function Runner CPU (cores)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current memory usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 536870912 + }, + { + "color": "red", + "value": 1073741824 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 28 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "process_resident_memory_bytes{job=\"function-runner\"}", + "refId": "A" + } + ], + "title": "Function Runner Memory", + "type": "stat" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["porch", "resources", "kubernetes"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Porch Resource Usage", + "uid": "porch-resource-usage", + "version": 1, + "weekStart": "" +} diff --git a/deployments/metrics-resources/prometheus-config.yaml b/deployments/metrics-resources/prometheus-config.yaml new file mode 100644 index 000000000..d91916b41 --- /dev/null +++ b/deployments/metrics-resources/prometheus-config.yaml @@ -0,0 +1,33 @@ +global: + scrape_interval: 10s + evaluation_interval: 10s + external_labels: + cluster: 'porch' + environment: 'testing' + +scrape_configs: + - job_name: 'porch-server' + scrape_interval: 10s + scrape_timeout: 10s + static_configs: + - targets: ['api.porch-system.svc.cluster.local:9464'] + labels: + service: 'porch-server' + component: 'apiserver' + + - job_name: 'function-runner' + scrape_interval: 10s + scrape_timeout: 10s + static_configs: + - targets: ['function-runner.porch-system.svc.cluster.local:9464'] + labels: + service: 'function-runner' + component: 'grpc-server' + + - job_name: 
'porch-performance-tests' + scrape_interval: 5s + scrape_timeout: 5s + static_configs: + - targets: [ '172.17.0.1:9095' ] + labels: + service: 'porch-perf-test' diff --git a/deployments/metrics/Kptfile b/deployments/metrics/Kptfile new file mode 100644 index 000000000..322d9340b --- /dev/null +++ b/deployments/metrics/Kptfile @@ -0,0 +1,20 @@ +apiVersion: kpt.dev/v1 +kind: Kptfile +metadata: + name: porch-monitoring + annotations: + config.kubernetes.io/local-config: "true" +info: + description: Prometheus, Pyroscope and Grafana monitoring stack for Porch +pipeline: + mutators: + - image: apply-setters:v0.2.0 + configMap: + prometheus-image: "docker.io/prom/prometheus:latest" + grafana-image: "docker.io/grafana/grafana:latest" + pyroscope-image: "docker.io/grafana/pyroscope:latest" + prometheus-nodeport: "30091" + grafana-nodeport: "30301" + - image: set-namespace:v0.4.1 + configMap: + namespace: porch-monitoring diff --git a/deployments/metrics/grafana-deployment.yaml b/deployments/metrics/grafana-deployment.yaml new file mode 100644 index 000000000..d944a172f --- /dev/null +++ b/deployments/metrics/grafana-deployment.yaml @@ -0,0 +1,162 @@ +# Copyright 2026 The Nephio Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + namespace: porch-monitoring # kpt-set: ${namespace} + labels: + app: grafana +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: docker.io/grafana/grafana:latest # kpt-set: ${grafana-image} + ports: + - containerPort: 3000 + name: http + env: + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: "Admin" + - name: GF_AUTH_DISABLE_LOGIN_FORM + value: "true" + - name: GF_USERS_ALLOW_SIGN_UP + value: "false" + - name: GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH + value: "/var/lib/grafana/dashboards/grafana-porch-server-dashboard.json" + - name: GF_DASHBOARDS_MIN_REFRESH_INTERVAL + value: "1s" + - name: GF_SERVER_ENABLE_GZIP + value: "true" + - name: GF_DATABASE_WAL + value: "true" + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + volumeMounts: + - name: grafana-storage + mountPath: /var/lib/grafana + - name: grafana-datasources + mountPath: /etc/grafana/provisioning/datasources + - name: grafana-dashboards-provider + mountPath: /etc/grafana/provisioning/dashboards + - name: grafana-dashboards + mountPath: /var/lib/grafana/dashboards + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "1" + volumes: + - name: grafana-storage + emptyDir: {} + - name: grafana-datasources + configMap: + name: grafana-datasources + - name: grafana-dashboards-provider + configMap: + name: grafana-dashboards-provider + - name: grafana-dashboards + configMap: + name: grafana-dashboards +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards-provider + namespace: 
porch-monitoring # kpt-set: ${namespace} +data: + dashboards.yaml: | + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + namespace: porch-monitoring # kpt-set: ${namespace} +data: + prometheus.yaml: | + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + access: proxy + uid: prometheus + url: http://prometheus:9090 + isDefault: true + editable: true + pyroscope.yaml: | + apiVersion: 1 + datasources: + - name: Pyroscope + type: phlare + access: proxy + uid: pyroscope + url: http://pyroscope:4040 + editable: true + jsonData: + backendType: pyroscope +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: porch-monitoring # kpt-set: ${namespace} + labels: + app: grafana +spec: + type: NodePort + ports: + - port: 3000 + targetPort: 3000 + nodePort: 30301 # kpt-set: ${grafana-nodeport} + name: http + selector: + app: grafana diff --git a/deployments/metrics/prometheus-deployment.yaml b/deployments/metrics/prometheus-deployment.yaml new file mode 100644 index 000000000..d9c318348 --- /dev/null +++ b/deployments/metrics/prometheus-deployment.yaml @@ -0,0 +1,120 @@ +# Copyright 2026 The Nephio Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: porch-monitoring # kpt-set: ${namespace} + labels: + app: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + serviceAccountName: prometheus + containers: + - name: prometheus + image: docker.io/prom/prometheus:latest # kpt-set: ${prometheus-image} + args: + - '--config.file=/etc/prometheus/prometheus-config.yaml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--storage.tsdb.retention.time=6h' + - '--query.timeout=1m' + ports: + - containerPort: 9090 + name: http + volumeMounts: + - name: prometheus-config + mountPath: /etc/prometheus + - name: prometheus-storage + mountPath: /prometheus + resources: + requests: + memory: "512Mi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "2000m" + volumes: + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-storage + emptyDir: {} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: porch-monitoring # kpt-set: ${namespace} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: + - extensions + resources: + - ingresses + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: porch-monitoring # kpt-set: ${namespace} +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: 
porch-monitoring # kpt-set: ${namespace} + labels: + app: prometheus +spec: + type: NodePort + ports: + - port: 9090 + targetPort: 9090 + nodePort: 30091 # kpt-set: ${prometheus-nodeport} + name: http + selector: + app: prometheus diff --git a/deployments/metrics/pyroscope-deployment.yaml b/deployments/metrics/pyroscope-deployment.yaml new file mode 100644 index 000000000..b0f94fa5a --- /dev/null +++ b/deployments/metrics/pyroscope-deployment.yaml @@ -0,0 +1,65 @@ +# Copyright 2026 The Nephio Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +kind: ServiceAccount +apiVersion: v1 +metadata: + name: pyroscope + namespace: porch-monitoring # kpt-set: ${namespace} + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pyroscope + namespace: porch-monitoring # kpt-set: ${namespace} +spec: + replicas: 1 + selector: + matchLabels: + app: pyroscope + template: + metadata: + labels: + app: pyroscope + spec: + serviceAccountName: pyroscope + containers: + - name: pyroscope + image: docker.io/grafana/pyroscope:latest # kpt-set: ${pyroscope-image} + imagePullPolicy: IfNotPresent + ports: + - containerPort: 4040 + name: ingest + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + +--- +apiVersion: v1 +kind: Service +metadata: + name: pyroscope + namespace: porch-monitoring # kpt-set: ${namespace} +spec: + ports: + - port: 4040 + protocol: TCP + targetPort: 4040 + name: ingest + selector: + app: pyroscope diff --git a/deployments/porch/2-function-runner.yaml b/deployments/porch/2-function-runner.yaml index 52b8af6c5..bd0278fa2 100644 --- a/deployments/porch/2-function-runner.yaml +++ b/deployments/porch/2-function-runner.yaml @@ -52,6 +52,13 @@ spec: env: - name: WRAPPER_SERVER_IMAGE value: docker.io/nephio/porch-wrapper-server:latest + # Uncomment to enable memory and cpu profiling with pyroscope + #- name: PYROSCOPE_SERVER + # value: "http://pyroscope.porch-monitoring.svc.cluster.local:4040" + #- name: PYROSCOPE_APP_NAME + # value: porch-function-runner + #- name: PYROSCOPE_LOGS_ENABLED + # value: "false" - name: OTEL_METRICS_EXPORTER value: "prometheus" - name: OTEL_EXPORTER_PROMETHEUS_HOST @@ -90,3 +97,8 @@ spec: - port: 9445 protocol: TCP targetPort: 9445 + name: grpc + - port: 9464 + protocol: TCP + targetPort: 9464 + name: metrics diff --git a/deployments/porch/3-porch-server.yaml b/deployments/porch/3-porch-server.yaml index 28d85cd99..1299a4f15 100644 --- a/deployments/porch/3-porch-server.yaml +++ b/deployments/porch/3-porch-server.yaml @@ -71,6 +71,16 @@ spec: 
# Uncomment to enable trace-reporting to jaeger #- name: OTEL # value: otel://jaeger-oltp:4317 + # Uncomment to enable memory and cpu profiling with pyroscope + #- name: PYROSCOPE_SERVER + # value: "http://pyroscope.porch-monitoring.svc.cluster.local:4040" + #- name: PYROSCOPE_APP_NAME + # value: porch-server + #- name: PYROSCOPE_LOGS_ENABLED + # value: "false" + # Uncomment to enable memory and cpu profiling with PPROF + #- name: PORCH_PPROF_PORT + # value: "8080" - name: OTEL_SERVICE_NAME value: porch-server - name: CERT_STORAGE_DIR @@ -94,6 +104,10 @@ spec: - --repo-operation-retry-attempts=3 - --max-request-body-size=6291456 # Keep this in sync with function-runner's corresponding argument - --cache-type=db + ports: + - containerPort: 9464 + name: metrics + protocol: TCP #adding livenessProbes and readinessProbes for porch server livenessProbe: httpGet: @@ -131,5 +145,9 @@ spec: protocol: TCP targetPort: 8443 name: webhooks + - port: 9464 + protocol: TCP + targetPort: 9464 + name: metrics selector: app: porch-server diff --git a/func/server/server.go b/func/server/server.go index c6b0cf056..454868b2f 100644 --- a/func/server/server.go +++ b/func/server/server.go @@ -30,6 +30,7 @@ import ( pb "github.com/nephio-project/porch/func/evaluator" "github.com/nephio-project/porch/func/healthchecker" "github.com/nephio-project/porch/func/internal" + "github.com/nephio-project/porch/internal/metrics" porchotel "github.com/nephio-project/porch/internal/otel" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" @@ -104,6 +105,10 @@ func main() { func run(o *options) error { ctx := contextsignal.SetupSignalContext() + pyro := &metrics.PyroscopeProfiling{} + pyro.Start() + defer pyro.Stop() + flagSet := flag.NewFlagSet("log-level", flag.ContinueOnError) klog.InitFlags(flagSet) _ = flagSet.Parse([]string{"--v", strconv.Itoa(o.logLevel)}) @@ -120,12 +125,17 @@ func run(o *options) error { lis.Close() }() - err = 
porchotel.SetupOpenTelemetry(ctx) + otelResources, err := porchotel.SetupOpenTelemetry(ctx) if err != nil { contextsignal.RequestShutdown() klog.Errorf("%v\n", err) return err } + defer func() { + if err := otelResources.ShutdownWithTimeout(10 * time.Second); err != nil { + klog.Warningf("failed to gracefully shutdown OpenTelemetry: %v", err) + } + }() availableRuntimes := map[string]struct{}{ execRuntime: {}, diff --git a/func/wrapper-server/main.go b/func/wrapper-server/main.go index 9ec2713a6..185fa70b4 100644 --- a/func/wrapper-server/main.go +++ b/func/wrapper-server/main.go @@ -25,6 +25,7 @@ import ( "os" "os/exec" "strconv" + "time" "github.com/kptdev/krm-functions-sdk/go/fn" pb "github.com/nephio-project/porch/func/evaluator" @@ -79,13 +80,17 @@ type options struct { func (o *options) run() error { ctx := contextsignal.SetupSignalContext() - err := porchotel.SetupOpenTelemetry(ctx) + otelResources, err := porchotel.SetupOpenTelemetry(ctx) if err != nil { contextsignal.RequestShutdown() klog.Errorf("%v\n", err) return err } - klog.Info("OpenTelemetry initialized") + defer func() { + if err := otelResources.ShutdownWithTimeout(10 * time.Second); err != nil { + klog.Warningf("failed to gracefully shutdown OpenTelemetry: %v", err) + } + }() address := fmt.Sprintf(":%d", o.port) lis, err := net.Listen("tcp", address) if err != nil { diff --git a/go.mod b/go.mod index d1f4e725d..231008bc6 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,9 @@ require ( github.com/google/go-cmp v0.7.0 github.com/google/go-containerregistry v0.20.6 github.com/google/uuid v1.6.0 + github.com/grafana/pyroscope-go v1.2.8 github.com/jackc/pgx/v5 v5.9.2 + github.com/joho/godotenv v1.5.1 github.com/kptdev/kpt v1.0.0-beta.62.1 github.com/kptdev/krm-functions-catalog/functions/go/apply-replacements v0.1.5 github.com/kptdev/krm-functions-catalog/functions/go/apply-setters v0.2.2 @@ -36,6 +38,8 @@ require ( go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 
go.opentelemetry.io/contrib/propagators/autoprop v0.63.0 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/prometheus v0.65.0 + go.opentelemetry.io/otel/metric v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/otel/trace v1.43.0 @@ -97,7 +101,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/prometheus v0.65.0 // indirect go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.19.0 // indirect go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 // indirect go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 // indirect @@ -169,6 +172,7 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect github.com/googleapis/gax-go/v2 v2.15.0 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect + github.com/grafana/pyroscope-go/godeltaprof v0.1.9 // indirect github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect @@ -221,7 +225,6 @@ require ( go.mongodb.org/mongo-driver v1.17.6 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.4 // indirect diff --git a/go.sum b/go.sum index 0e035214e..3acd0032c 100644 --- a/go.sum +++ b/go.sum @@ -224,6 +224,10 @@ github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81 github.com/googleapis/gax-go/v2 v2.15.0/go.mod 
h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= +github.com/grafana/pyroscope-go v1.2.8 h1:UvCwIhlx9DeV7F6TW/z8q1Mi4PIm3vuUJ2ZlCEvmA4M= +github.com/grafana/pyroscope-go v1.2.8/go.mod h1:SSi59eQ1/zmKoY/BKwa5rSFsJaq+242Bcrr4wPix1g8= +github.com/grafana/pyroscope-go/godeltaprof v0.1.9 h1:c1Us8i6eSmkW+Ez05d3co8kasnuOY813tbMN8i/a3Og= +github.com/grafana/pyroscope-go/godeltaprof v0.1.9/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 h1:qnpSQwGEnkcRpTqNOIR6bJbR0gAorgP9CSALpRcKoAA= @@ -246,6 +250,8 @@ github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/jonboulle/clockwork v0.5.0 h1:Hyh9A8u51kptdkR+cqRpT1EebBwTn1oK9YfGYbdFz6I= github.com/jonboulle/clockwork v0.5.0/go.mod h1:3mZlmanh0g2NDKO5TWZVJAfofYk64M7XN3SzBPjZF60= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 
000000000..b7be88a1f --- /dev/null +++ b/internal/metrics/metrics.go @@ -0,0 +1,236 @@ +// Copyright 2026 The kpt and Nephio Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "context" + "fmt" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "k8s.io/apiserver/pkg/endpoints/request" +) + +const meterName = "github.com/nephio-project/porch" + +var ( + apiCallDurationSeconds metric.Float64Histogram + + operationDuration metric.Float64Histogram + operationCounter metric.Float64Counter + repositoryCounter metric.Float64Counter + packageCounter metric.Float64Counter + packageRevisionCounter metric.Float64Counter + lifecycleTransitionDuration metric.Float64Histogram + testRunInfoGauge metric.Float64Gauge + activeOperations metric.Float64UpDownCounter + + RequestsTotal metric.Float64Counter +) + +func InitMetrics() { + m := otel.Meter(meterName) + var err error + + apiCallDurationSeconds, err = m.Float64Histogram( + "porch_api_call_duration_seconds", + metric.WithDescription("Duration of porch API calls in seconds."), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries( + 0.001, 0.002, 0.004, 0.008, 0.016, 0.032, 0.064, 0.128, + 0.256, 0.512, 1.024, 2.048, 4.096, 8.192, 16.384, 32.768, + ), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_api_call_duration_seconds: %v", err)) + } + + operationDuration, err = m.Float64Histogram( + 
"porch_perf_operation_duration_seconds", + metric.WithDescription("Duration of Porch performance test operations in seconds"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 30, 60, 120), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_operation_duration_seconds: %v", err)) + } + + operationCounter, err = m.Float64Counter( + "porch_perf_operations_total", + metric.WithDescription("Total number of Porch performance test operations"), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_operations_total: %v", err)) + } + + repositoryCounter, err = m.Float64Counter( + "porch_perf_repositories_created_total", + metric.WithDescription("Total number of repositories created in performance tests"), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_repositories_created_total: %v", err)) + } + + packageCounter, err = m.Float64Counter( + "porch_perf_packages_created_total", + metric.WithDescription("Total number of packages created in performance tests"), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_packages_created_total: %v", err)) + } + + packageRevisionCounter, err = m.Float64Counter( + "porch_perf_package_revisions_total", + metric.WithDescription("Total number of package revisions created in performance tests"), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_package_revisions_total: %v", err)) + } + + lifecycleTransitionDuration, err = m.Float64Histogram( + "porch_perf_lifecycle_transition_duration_seconds", + metric.WithDescription("Duration of package lifecycle transitions in seconds"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(0.1, 0.5, 1, 2, 5, 10, 30, 60), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_lifecycle_transition_duration_seconds: %v", err)) + } + + testRunInfoGauge, err = m.Float64Gauge( + "porch_perf_test_run_info", + 
metric.WithDescription("Information about the current performance test run"), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_test_run_info: %v", err)) + } + + activeOperations, err = m.Float64UpDownCounter( + "porch_perf_active_operations", + metric.WithDescription("Number of currently active operations"), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_perf_active_operations: %v", err)) + } + + RequestsTotal, err = m.Float64Counter( + "porch_api_requests_by_user", + metric.WithDescription("Total number of requests tracked by BurstCounter, broken down by resource, operation, and user."), + ) + if err != nil { + panic(fmt.Sprintf("failed to create porch_api_requests_by_user: %v", err)) + } +} + +// Porch server and function runner metric recording functions +func RecordAPICallDuration(resource, verb string, durationSeconds float64) { + if apiCallDurationSeconds == nil { + return + } + apiCallDurationSeconds.Record(context.Background(), durationSeconds, + metric.WithAttributes( + attribute.String("resource", resource), + attribute.String("verb", verb), + ), + ) +} + +func RecordRequestCount(ctx context.Context, resource, op string) { + if RequestsTotal == nil { + return + } + user := getK8sUserName(ctx) + RequestsTotal.Add(context.Background(), 1, + metric.WithAttributes( + attribute.String("resource", resource), + attribute.String("op", op), + attribute.String("user", user), + ), + ) +} + +// Performance test metric recording functions +func PerfTestRecordMetric(operation, repoName, pkgName string, duration time.Duration, err error) { + attrs := metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("repository", repoName), + attribute.String("package", pkgName), + attribute.String("status", statusLabel(err)), + ) + ctx := context.Background() + operationDuration.Record(ctx, duration.Seconds(), attrs) + operationCounter.Add(ctx, 1, attrs) +} + +func PerfTestRecordLifecycleTransition(fromState, 
toState, repoName, pkgName string, duration time.Duration, err error) { + lifecycleTransitionDuration.Record(context.Background(), duration.Seconds(), + metric.WithAttributes( + attribute.String("from_state", fromState), + attribute.String("to_state", toState), + attribute.String("repository", repoName), + attribute.String("package", pkgName), + attribute.String("status", statusLabel(err)), + ), + ) +} + +func PerfTestRecordPackageRevision(operation string, err error) { + packageRevisionCounter.Add(context.Background(), 1, + metric.WithAttributes( + attribute.String("operation", operation), + attribute.String("status", statusLabel(err)), + ), + ) +} + +func PerfTestSetTestRunInfo(testName, namespace string, startTime time.Time) { + testRunInfoGauge.Record(context.Background(), 1, + metric.WithAttributes( + attribute.String("test_name", testName), + attribute.String("namespace", namespace), + attribute.String("start_time", startTime.Format(time.RFC3339)), + ), + ) +} + +func PerfTestRecordActiveOperation(operation string, delta float64) { + activeOperations.Add(context.Background(), delta, + metric.WithAttributes(attribute.String("operation", operation)), + ) +} + +func PerfTestIncrementRepositoryCounter() { + repositoryCounter.Add(context.Background(), 1) +} + +func PerfTestIncrementPackageCounter() { + packageCounter.Add(context.Background(), 1) +} + +func statusLabel(err error) string { + if err != nil { + return "error" + } + return "success" +} + +func getK8sUserName(ctx context.Context) string { + if user, ok := request.UserFrom(ctx); ok { + return user.GetName() + } + return "" +} diff --git a/internal/metrics/pprof.go b/internal/metrics/pprof.go new file mode 100644 index 000000000..401eb0eba --- /dev/null +++ b/internal/metrics/pprof.go @@ -0,0 +1,92 @@ +// Copyright 2025 The Nephio Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "context" + "fmt" + "net/http" + httppprof "net/http/pprof" + "os" + "runtime" + "strconv" + "time" + + "k8s.io/klog/v2" +) + +const PProfPortEnvVar = "PORCH_PPROF_PORT" + +type Profiling struct { + port int + server *http.Server +} + +func (p *Profiling) Start() { + if envport := os.Getenv(PProfPortEnvVar); envport != "" { + parsed, err := strconv.Atoi(envport) + if err != nil { + klog.Warningf("Failed to parse %s environment variable (%s) as int: %v", PProfPortEnvVar, envport, err) + return + } + p.port = parsed + } else { + return + } + + // Enable profiling for mutex and block profiles + runtime.SetMutexProfileFraction(1) + runtime.SetBlockProfileRate(1) + + mux := http.NewServeMux() + + mux.HandleFunc("/debug/pprof/", httppprof.Index) + mux.HandleFunc("/debug/pprof/cmdline", httppprof.Cmdline) + mux.HandleFunc("/debug/pprof/profile", httppprof.Profile) + mux.HandleFunc("/debug/pprof/symbol", httppprof.Symbol) + mux.HandleFunc("/debug/pprof/trace", httppprof.Trace) + mux.Handle("/debug/pprof/heap", httppprof.Handler("heap")) + mux.Handle("/debug/pprof/goroutine", httppprof.Handler("goroutine")) + mux.Handle("/debug/pprof/threadcreate", httppprof.Handler("threadcreate")) + mux.Handle("/debug/pprof/block", httppprof.Handler("block")) + mux.Handle("/debug/pprof/mutex", httppprof.Handler("mutex")) + mux.Handle("/debug/pprof/allocs", httppprof.Handler("allocs")) + p.server = &http.Server{ + Addr: fmt.Sprintf(":%d", p.port), + Handler: mux, + ReadHeaderTimeout: 10 * time.Second, + } + go p.serve() +} + +func (p 
*Profiling) Stop() { + if p.server != nil { + klog.Infof("Shutting down profiling server") + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(3*time.Second)) + defer cancel() + err := p.server.Shutdown(ctx) + if err != nil { + klog.Errorf("Error shutting down profiling server: %v", err) + } + } +} + +func (p *Profiling) serve() { + klog.Infof("Starting profiling server on port :%d", p.port) + if err := p.server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + klog.Errorf("Error starting metrics server: %v", err) + } + klog.Info("Profiling server stopped") +} diff --git a/internal/metrics/pyroscope.go b/internal/metrics/pyroscope.go new file mode 100644 index 000000000..5279be852 --- /dev/null +++ b/internal/metrics/pyroscope.go @@ -0,0 +1,103 @@ +// Copyright 2026 The Nephio Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metrics + +import ( + "os" + "runtime" + + "github.com/grafana/pyroscope-go" + "k8s.io/klog/v2" +) + +const ( + PyroscopeServerEnvVar = "PYROSCOPE_SERVER" + PyroscopeAppNameEnvVar = "PYROSCOPE_APP_NAME" + PyroscopeAuthUserVar = "PYROSCOPE_AUTH_USER" + PyroscopeAuthPassVar = "PYROSCOPE_AUTH_PASSWORD" // #nosec G101 -- env var name, not a credential (nolint:gosec) + PyroscopeLogsEnabledEnvVar = "PYROSCOPE_LOGS_ENABLED" +) + +const defaultPyroscopeAppName = "porch-server" + +type PyroscopeProfiling struct { + stop func() error +} + +func (p *PyroscopeProfiling) Start() { + serverURL := os.Getenv(PyroscopeServerEnvVar) + if serverURL == "" { + return + } + + appName := os.Getenv(PyroscopeAppNameEnvVar) + if appName == "" { + appName = defaultPyroscopeAppName + } + + runtime.SetMutexProfileFraction(1) + runtime.SetBlockProfileRate(1) + + var logger pyroscope.Logger + logsEnabled := os.Getenv(PyroscopeLogsEnabledEnvVar) + if logsEnabled == "true" || logsEnabled == "1" { + logger = pyroscope.StandardLogger + } + + cfg := pyroscope.Config{ + ApplicationName: appName, + ServerAddress: serverURL, + Logger: logger, + Tags: map[string]string{ + "service_name": appName, + }, + ProfileTypes: []pyroscope.ProfileType{ + pyroscope.ProfileCPU, + pyroscope.ProfileAllocObjects, + pyroscope.ProfileAllocSpace, + pyroscope.ProfileInuseObjects, + pyroscope.ProfileInuseSpace, + pyroscope.ProfileGoroutines, + pyroscope.ProfileMutexCount, + pyroscope.ProfileMutexDuration, + pyroscope.ProfileBlockCount, + pyroscope.ProfileBlockDuration, + }, + } + if user := os.Getenv(PyroscopeAuthUserVar); user != "" { + cfg.BasicAuthUser = user + } + if password := os.Getenv(PyroscopeAuthPassVar); password != "" { + cfg.BasicAuthPassword = password + } + + profiler, err := pyroscope.Start(cfg) + if err != nil { + klog.Warningf("Failed to start Pyroscope profiling: %v", err) + return + } + p.stop = profiler.Stop + klog.Infof("Pyroscope continuous profiling started (server=%q, app=%q)", serverURL, 
appName) +} + +func (p *PyroscopeProfiling) Stop() { + if p.stop != nil { + klog.Infof("Stopping Pyroscope profiler") + if err := p.stop(); err != nil { + klog.Warningf("Pyroscope profiler stop: %v", err) + } + p.stop = nil + } +} diff --git a/internal/otel/otel.go b/internal/otel/otel.go index c69d1c48b..d43969e61 100644 --- a/internal/otel/otel.go +++ b/internal/otel/otel.go @@ -18,77 +18,203 @@ import ( "context" "fmt" "net/http" + "os" + "strconv" "time" - "go.opentelemetry.io/contrib/bridges/prometheus" + "github.com/nephio-project/porch/internal/metrics" + prombridge "go.opentelemetry.io/contrib/bridges/prometheus" "go.opentelemetry.io/contrib/exporters/autoexport" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/contrib/propagators/autoprop" "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/sdk/metric" + otelprometheus "go.opentelemetry.io/otel/exporters/prometheus" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/trace" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "k8s.io/klog/v2" controllerruntimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics" ) -// Sets up OpenTelemetry with parameters -// from environment variables based on the -// opentelemetry.io/contrib/exporters/autoexport" -func SetupOpenTelemetry(ctx context.Context) error { +const ( + otelPortEnv = "OTEL_EXPORTER_PROMETHEUS_PORT" +) + +// OTelResources holds all OpenTelemetry resources that need lifecycle management. +// Use Shutdown() to cleanly release all resources. +type OTelResources struct { + metricsServer *http.Server + metricsPort int + meterProvider *sdkmetric.MeterProvider + tracerProvider *trace.TracerProvider + metricReader sdkmetric.Reader +} + +// Shutdown gracefully shuts down all OpenTelemetry resources. 
+func (r *OTelResources) Shutdown(ctx context.Context) error { + var errs []error + if r.metricsServer != nil { + if err := r.metricsServer.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("metrics server shutdown: %w", err)) + } + } + if r.metricReader != nil { + if err := r.metricReader.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("metric reader shutdown: %w", err)) + } + } + if r.meterProvider != nil { + if err := r.meterProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("meter provider shutdown: %w", err)) + } + } + if r.tracerProvider != nil { + if err := r.tracerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("tracer provider shutdown: %w", err)) + } + } + if len(errs) > 0 { + return fmt.Errorf("otel shutdown errors: %v", errs) + } + return nil +} + +// ShutdownWithTimeout is a convenience wrapper around Shutdown with a timeout. +func (r *OTelResources) ShutdownWithTimeout(timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + return r.Shutdown(ctx) +} + +// Flush forces a flush of the meter provider, useful in tests. +func (r *OTelResources) Flush() error { + if r.meterProvider != nil { + return r.meterProvider.ForceFlush(context.Background()) + } + return nil +} + +// SetupOpenTelemetry is the single entry point for all OpenTelemetry setup. +// It configures tracing, metrics (including the Prometheus HTTP server if +// OTEL_EXPORTER_PROMETHEUS_PORT is set), and initializes all Porch metric +// instruments. Returns OTelResources for lifecycle management. 
+func SetupOpenTelemetry(ctx context.Context) (*OTelResources, error) { setupTiming := time.Now() - err := setupTracing(ctx) - if err != nil { - return err + res := &OTelResources{} + + // Setup tracing + if err := setupTracing(ctx, res); err != nil { + return nil, err } - err = setupMetrics(ctx) - if err != nil { - return err + + // Setup metrics provider + if err := setupMetrics(ctx, res); err != nil { + return nil, err } + + // Initialize all Porch metric instruments + metrics.InitMetrics() + + // Start the Prometheus metrics HTTP server if port is configured + if err := startMetricsServerIfConfigured(res); err != nil { + return nil, err + } + http.DefaultTransport = otelhttp.NewTransport(http.DefaultTransport) http.DefaultClient.Transport = http.DefaultTransport klog.Infof("OpenTelemetry initialized in %s", time.Since(setupTiming)) - return nil - + return res, nil } -func setupTracing(ctx context.Context) error { +func setupTracing(ctx context.Context, res *OTelResources) error { exp, err := autoexport.NewSpanExporter(ctx) if err != nil { return fmt.Errorf("failed to create span exporter: %w", err) } tp := trace.NewTracerProvider(trace.WithBatcher(exp)) - go func() { - <-ctx.Done() - if err := tp.Shutdown(context.Background()); err != nil { - panic(err) - } - }() + res.tracerProvider = tp otel.SetTracerProvider(tp) otel.SetTextMapPropagator(autoprop.NewTextMapPropagator()) - return nil } -func setupMetrics(ctx context.Context) error { - autoexport.WithFallbackMetricProducer(func(ctx context.Context) (metric.Producer, error) { - return prometheus.NewMetricProducer( - prometheus.WithGatherer(controllerruntimemetrics.Registry), +func setupMetrics(ctx context.Context, res *OTelResources) error { + exporter := os.Getenv("OTEL_METRICS_EXPORTER") + + autoexport.WithFallbackMetricProducer(func(ctx context.Context) (sdkmetric.Producer, error) { + return prombridge.NewMetricProducer( + prombridge.WithGatherer(prometheus.Gatherers{ + prometheus.DefaultGatherer, + 
controllerruntimemetrics.Registry, + }), ), nil }) - mr, err := autoexport.NewMetricReader(ctx) + promExp, err := otelprometheus.New( + otelprometheus.WithRegisterer(prometheus.DefaultRegisterer), + ) if err != nil { - return fmt.Errorf("failed to create metric reader: %w", err) + return fmt.Errorf("failed to create prometheus exporter: %w", err) } - go func() { - <-ctx.Done() - if err := mr.Shutdown(context.Background()); err != nil { - panic(err) + + readers := []sdkmetric.Option{sdkmetric.WithReader(promExp)} + + if exporter != "prometheus" { + autoMr, err := autoexport.NewMetricReader(ctx) + if err != nil { + return fmt.Errorf("failed to create metric reader: %w", err) } - }() + res.metricReader = autoMr + readers = append(readers, sdkmetric.WithReader(autoMr)) + } - mp := metric.NewMeterProvider(metric.WithReader(mr)) + mp := sdkmetric.NewMeterProvider(readers...) + res.meterProvider = mp otel.SetMeterProvider(mp) return nil } + +func startMetricsServerIfConfigured(res *OTelResources) error { + portStr := os.Getenv(otelPortEnv) + if portStr == "" { + return nil + } + port, err := strconv.Atoi(portStr) + if err != nil { + return fmt.Errorf("invalid %s value %q: %w", otelPortEnv, portStr, err) + } + if port <= 0 { + return nil + } + + gatherers := prometheus.Gatherers{ + prometheus.DefaultGatherer, + controllerruntimemetrics.Registry, + } + handler := promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{ + ErrorHandling: promhttp.ContinueOnError, + }) + + mux := http.NewServeMux() + mux.Handle("/metrics", handler) + + srv := &http.Server{ + Addr: fmt.Sprintf(":%d", port), + Handler: mux, + ReadHeaderTimeout: 10 * time.Second, + } + res.metricsServer = srv + res.metricsPort = port + + go func() { + if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + klog.Errorf("OTel metrics server error: %v", err) + } + }() + klog.Infof("OTel metrics server started on port %d", port) + + return nil +} diff --git a/internal/otel/otel_test.go 
b/internal/otel/otel_test.go index 91e804911..a5ab401b3 100644 --- a/internal/otel/otel_test.go +++ b/internal/otel/otel_test.go @@ -22,6 +22,7 @@ import ( "net/http" "net/http/httptest" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -47,10 +48,11 @@ func TestOtelMetricsPushHTTP(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - err := SetupOpenTelemetry(ctx) + res, err := SetupOpenTelemetry(ctx) require.NoError(t, err) - cancel() + // Shutdown flushes the periodic reader, which triggers the export + res.ShutdownWithTimeout(5 * time.Second) <-requestWaitChannel } @@ -68,7 +70,7 @@ func TestOtelTracesPushHTTP(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - err := SetupOpenTelemetry(ctx) + res, err := SetupOpenTelemetry(ctx) require.NoError(t, err) // Create a span to trigger trace export @@ -76,36 +78,26 @@ func TestOtelTracesPushHTTP(t *testing.T) { _, span := tracer.Start(ctx, "test-span") span.End() + // Shutdown flushes the batch span processor + res.ShutdownWithTimeout(5 * time.Second) <-requestWaitChannel } func TestSetupOpenTelemetryPrometheusEndpoint(t *testing.T) { - // Find available port - listener, err := net.Listen("tcp", ":0") - require.NoError(t, err) - port := listener.Addr().(*net.TCPAddr).Port - listener.Close() - t.Setenv("OTEL_METRICS_EXPORTER", "prometheus") - t.Setenv("OTEL_EXPORTER_PROMETHEUS_HOST", "localhost") - t.Setenv("OTEL_EXPORTER_PROMETHEUS_PORT", fmt.Sprintf("%d", port)) + t.Setenv("OTEL_TRACES_EXPORTER", "none") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - err = SetupOpenTelemetry(ctx) + res, err := SetupOpenTelemetry(ctx) require.NoError(t, err) + defer res.ShutdownWithTimeout(5 * time.Second) - // Make request to the Prometheus metrics endpoint - resp, err := http.Get(fmt.Sprintf("http://localhost:%d/metrics", port)) + // Verify that metrics are accessible via the OTel meter 
provider + meter := otel.Meter("test") + counter, err := meter.Float64Counter("test_counter") require.NoError(t, err) - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - - metricsText := string(body) - // Verify at least one metric line exists (non-comment, non-empty) - assert.Regexp(t, `(?m)^([a-zA-Z_][a-zA-Z0-9_]*)`, metricsText) + counter.Add(ctx, 1) } func TestOtelMetricsPushGRPC(t *testing.T) { @@ -133,10 +125,10 @@ func TestOtelMetricsPushGRPC(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - err = SetupOpenTelemetry(ctx) + res, err := SetupOpenTelemetry(ctx) require.NoError(t, err) - cancel() + res.ShutdownWithTimeout(5 * time.Second) <-requestWaitChannel } @@ -165,7 +157,7 @@ func TestOtelTracesPushGRPC(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - err = SetupOpenTelemetry(ctx) + res, err := SetupOpenTelemetry(ctx) require.NoError(t, err) // Create a span to trigger trace export @@ -173,7 +165,7 @@ func TestOtelTracesPushGRPC(t *testing.T) { _, span := tracer.Start(ctx, "test-span") span.End() - cancel() + res.ShutdownWithTimeout(5 * time.Second) <-requestWaitChannel } diff --git a/pkg/externalrepo/git/git.go b/pkg/externalrepo/git/git.go index f8963ad30..61918e021 100644 --- a/pkg/externalrepo/git/git.go +++ b/pkg/externalrepo/git/git.go @@ -37,6 +37,7 @@ import ( kptfilev1 "github.com/kptdev/kpt/pkg/api/kptfile/v1" porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" configapi "github.com/nephio-project/porch/api/porchconfig/v1alpha1" + "github.com/nephio-project/porch/internal/metrics" "github.com/nephio-project/porch/pkg/errors" externalrepotypes "github.com/nephio-project/porch/pkg/externalrepo/types" "github.com/nephio-project/porch/pkg/repository" @@ -60,6 +61,7 @@ const ( // Retry delay constants baseRetryDelay = 200 * time.Millisecond hookRetryDelay = 1 * time.Second + metricsName = "ExternalRepo" ) // Retryable error 
patterns for git push operations @@ -1129,6 +1131,9 @@ func (r *gitRepository) GetRepo() (string, error) { func (r *gitRepository) fetchRemoteRepository(ctx context.Context) error { ctx, span := tracer.Start(ctx, "gitRepository::fetchRemoteRepository", trace.WithAttributes()) defer span.End() + + metrics.RecordRequestCount(ctx, metricsName, "FETCH") + start := time.Now() defer func() { klog.V(2).Infof("Fetching repository %q took %s", r.key.Name, time.Since(start)) }() diff --git a/pkg/registry/porch/packagecommon.go b/pkg/registry/porch/packagecommon.go index 1fef9e705..ac28a66c7 100644 --- a/pkg/registry/porch/packagecommon.go +++ b/pkg/registry/porch/packagecommon.go @@ -172,7 +172,7 @@ func (v *v1alpha2FilteringWatcher) OnPackageRevisionChange(eventType watch.Event } func (r *packageCommon) watchPackages(ctx context.Context, filter repository.ListPackageRevisionFilter, callback engine.ObjectWatcher) error { - var watcher engine.ObjectWatcher = callback + var watcher = callback if ns, namespaced := genericapirequest.NamespaceFrom(ctx); namespaced && ns != "" { watcher = &namespaceFilteringWatcher{ns: ns, delegate: watcher} diff --git a/pkg/registry/porch/packagerevision.go b/pkg/registry/porch/packagerevision.go index 943d3f27b..bd0d113b1 100644 --- a/pkg/registry/porch/packagerevision.go +++ b/pkg/registry/porch/packagerevision.go @@ -17,8 +17,10 @@ package porch import ( "context" "fmt" + "time" porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" + "github.com/nephio-project/porch/internal/metrics" "github.com/nephio-project/porch/pkg/repository" context1 "github.com/nephio-project/porch/pkg/util/context" "go.opentelemetry.io/otel" @@ -33,6 +35,8 @@ import ( "k8s.io/klog/v2" ) +const PRMetricsName = "PackageRevision" + var tracer = otel.Tracer("packagerevision") type packageRevisions struct { @@ -72,7 +76,13 @@ func (r *packageRevisions) NamespaceScoped() bool { // List selects resources in the storage which match to the selector. 
'options' can be nil. func (r *packageRevisions) List(ctx context.Context, options *metainternalversion.ListOptions) (runtime.Object, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisions::List", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration(PRMetricsName, "LIST", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRMetricsName, "LIST") ctx = context1.WithNewRequestID(ctx) @@ -114,7 +124,13 @@ func (r *packageRevisions) List(ctx context.Context, options *metainternalversio // Get implements the Getter interface func (r *packageRevisions) Get(ctx context.Context, name string, _ *metav1.GetOptions) (runtime.Object, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisions::Get", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration(PRMetricsName, "GET", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRMetricsName, "GET") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) @@ -139,7 +155,13 @@ func (r *packageRevisions) Get(ctx context.Context, name string, _ *metav1.GetOp func (r *packageRevisions) Create(ctx context.Context, runtimeObject runtime.Object, _ rest.ValidateObjectFunc, _ *metav1.CreateOptions) (runtime.Object, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisions::Create", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration(PRMetricsName, "CREATE", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRMetricsName, "CREATE") ctx = context1.WithNewRequestID(ctx) @@ -280,7 +302,13 @@ func createAction(pkgRev *porchapi.PackageRevision) string { func (r *packageRevisions) Update(ctx context.Context, name string, objInfo rest.UpdatedObjectInfo, createValidation rest.ValidateObjectFunc, updateValidation 
rest.ValidateObjectUpdateFunc, forceAllowCreate bool, _ *metav1.UpdateOptions) (runtime.Object, bool, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisions::Update", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration(PRMetricsName, "UPDATE", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRMetricsName, "UPDATE") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) @@ -304,7 +332,13 @@ func (r *packageRevisions) Update(ctx context.Context, name string, objInfo rest // deleted or false if it will be deleted asynchronously. func (r *packageRevisions) Delete(ctx context.Context, name string, deleteValidation rest.ValidateObjectFunc, _ *metav1.DeleteOptions) (runtime.Object, bool, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisions::Delete", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration(PRMetricsName, "DELETE", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRMetricsName, "DELETE") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) diff --git a/pkg/registry/porch/packagerevision_approval.go b/pkg/registry/porch/packagerevision_approval.go index 35a3091e9..d286f65aa 100644 --- a/pkg/registry/porch/packagerevision_approval.go +++ b/pkg/registry/porch/packagerevision_approval.go @@ -18,8 +18,10 @@ import ( "context" "fmt" "strings" + "time" porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" + "github.com/nephio-project/porch/internal/metrics" context1 "github.com/nephio-project/porch/pkg/util/context" "go.opentelemetry.io/otel/trace" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -29,6 +31,8 @@ import ( "k8s.io/klog/v2" ) +const PRAMetricsName = "PackageRevisionApproval" + type packageRevisionApproval struct { packageCommon } @@ -53,7 +57,13 @@ func (a *packageRevisionApproval) NamespaceScoped() bool { func (a 
*packageRevisionApproval) Get(ctx context.Context, name string, _ *metav1.GetOptions) (runtime.Object, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisionApproval::Get", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration("PackageRevisionApproval", "GET", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRAMetricsName, "GET") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) @@ -70,7 +80,13 @@ func (a *packageRevisionApproval) Get(ctx context.Context, name string, _ *metav func (a *packageRevisionApproval) Update(ctx context.Context, name string, objInfo rest.UpdatedObjectInfo, createValidation rest.ValidateObjectFunc, updateValidation rest.ValidateObjectUpdateFunc, _ bool, _ *metav1.UpdateOptions) (runtime.Object, bool, error) { ctx, span := tracer.Start(ctx, "[START]::packageRevisionApproval::Update", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration("PackageRevisionApproval", "UPDATE", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRAMetricsName, "UPDATE") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) diff --git a/pkg/registry/porch/packagerevisionresources.go b/pkg/registry/porch/packagerevisionresources.go index ff9b8d0eb..142ccf7fe 100644 --- a/pkg/registry/porch/packagerevisionresources.go +++ b/pkg/registry/porch/packagerevisionresources.go @@ -17,10 +17,12 @@ package porch import ( "context" "fmt" + "time" porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" porchv1alpha2 "github.com/nephio-project/porch/api/porch/v1alpha2" "github.com/nephio-project/porch/api/porchconfig/v1alpha1" + "github.com/nephio-project/porch/internal/metrics" "github.com/nephio-project/porch/pkg/repository" context1 "github.com/nephio-project/porch/pkg/util/context" "go.opentelemetry.io/otel/trace" @@ -35,6 +37,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client" ) +const PRRMetricsName = "PackageRevisionResources" + type packageRevisionResources struct { rest.TableConvertor packageCommon @@ -69,7 +73,13 @@ func (r *packageRevisionResources) NamespaceScoped() bool { // List selects resources in the storage which match to the selector. 'options' can be nil. func (r *packageRevisionResources) List(ctx context.Context, options *metainternalversion.ListOptions) (runtime.Object, error) { ctx, span := tracer.Start(ctx, "[START]::PackageRevisionResources::List", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration("PackageRevisionResources", "LIST", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRRMetricsName, "LIST") ctx = context1.WithNewRequestID(ctx) @@ -109,7 +119,13 @@ func (r *packageRevisionResources) List(ctx context.Context, options *metaintern // Get implements the Getter interface func (r *packageRevisionResources) Get(ctx context.Context, name string, _ *metav1.GetOptions) (runtime.Object, error) { ctx, span := tracer.Start(ctx, "[START]::PackageRevisionResources::Get", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() { + span.End() + metrics.RecordAPICallDuration("PackageRevisionResources", "GET", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRRMetricsName, "GET") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) @@ -136,7 +152,13 @@ func (r *packageRevisionResources) Get(ctx context.Context, name string, _ *meta func (r *packageRevisionResources) Update(ctx context.Context, name string, objInfo rest.UpdatedObjectInfo, _ rest.ValidateObjectFunc, updateValidation rest.ValidateObjectUpdateFunc, _ bool, _ *metav1.UpdateOptions) (runtime.Object, bool, error) { ctx, span := tracer.Start(ctx, "[START]::PackageRevisionResources::Update", trace.WithAttributes()) - defer span.End() + start := time.Now() + defer func() 
{ + span.End() + metrics.RecordAPICallDuration("PackageRevisionResources", "UPDATE", time.Since(start).Seconds()) + }() + + metrics.RecordRequestCount(ctx, PRRMetricsName, "UPDATE") ctx = context1.WithNewRequestIDAndPackageRevision(ctx, name) diff --git a/scripts/common.sh b/scripts/common.sh index df0a23f43..f4c2106e9 100644 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -23,3 +23,10 @@ FN_RUNNER_WARM_UP_POD_CACHE=${FN_RUNNER_WARM_UP_POD_CACHE:-true} DB_PUSH_DRAFTS_TO_GIT=${DB_PUSH_DRAFTS_TO_GIT:-false} CREATE_V1ALPHA2_RPKG=${CREATE_V1ALPHA2_RPKG:-false} DEPLOYPORCHCONFIGDIR=${DEPLOYPORCHCONFIGDIR:-${PORCHDIR}/.build/deploy} + +DOT_ENV_PATH="${PORCHDIR}/.env" +if [ -f "$DOT_ENV_PATH" ]; then + export $(grep -v '^#' "$DOT_ENV_PATH" | xargs) +fi +DOCKERHUB_MIRROR="${DOCKERHUB_MIRROR:-docker.io}" + diff --git a/scripts/create-deployment-blueprint.sh b/scripts/create-deployment-blueprint.sh index 61fdff1d1..701c7e922 100755 --- a/scripts/create-deployment-blueprint.sh +++ b/scripts/create-deployment-blueprint.sh @@ -90,6 +90,10 @@ while [[ $# -gt 0 ]]; do GHCR_IMAGE_PREFIX="${2}" shift 2 ;; + --dockerhub-mirror) + DOCKERHUB_MIRROR="${2}" + shift 2 + ;; --fn-runner-warm-up-pod-cache) FN_RUNNER_WARM_UP_POD_CACHE="${2}" shift 2 @@ -281,7 +285,7 @@ function configure_porch_cache() { -- "source= for resource in ctx.resource_list['items']: podspec = resource['spec']['template']['spec'] - + # Update containers for container in podspec.get('containers', []): if 'envFrom' in container: @@ -411,10 +415,13 @@ function main() { "docker.io/nephio/porch-wrapper-server:latest" \ "${WRAPPER_SERVER_IMAGE}" - if [[ -n "${DOCKERHUB_MIRROR}" ]]; then + if [[ "${DOCKERHUB_MIRROR}" != "docker.io" ]]; then + customize_image \ + "postgres:17-alpine" \ + "${DOCKERHUB_MIRROR}/postgres:17-alpine" customize_image \ - "docker.io/bitnamilegacy/postgresql:17.6.0-debian-12-r4" \ - "${DOCKERHUB_MIRROR}/bitnamilegacy/postgresql:17.6.0-debian-12-r4" + 
"docker.io/bitnamilegacy/postgresql:17.6.0-debian-12-r4" \ + "${DOCKERHUB_MIRROR}/bitnamilegacy/postgresql:17.6.0-debian-12-r4" fi } diff --git a/scripts/deploy-monitoring.sh b/scripts/deploy-monitoring.sh new file mode 100755 index 000000000..0fcb996e3 --- /dev/null +++ b/scripts/deploy-monitoring.sh @@ -0,0 +1,277 @@ +#!/usr/bin/env bash +# Copyright 2024-2025 The Nephio Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +METRICS_DIR="${SCRIPT_DIR}/../deployments/metrics" +DOT_ENV_PATH="${SCRIPT_DIR}/../.env" + +if [ -f "$DOT_ENV_PATH" ]; then + export $(grep -v '^#' "$DOT_ENV_PATH" | xargs) +fi + +# Configuration +NAMESPACE="${NAMESPACE:-porch-monitoring}" +PROMETHEUS_NODEPORT="${PROMETHEUS_NODEPORT:-30091}" +GRAFANA_NODEPORT="${GRAFANA_NODEPORT:-30301}" + +DOCKERHUB_MIRROR="${DOCKERHUB_MIRROR:-docker.io}" +KRM_FN_REGISTRY_URL="${KRM_FN_REGISTRY_URL:-gcr.io/kptdev/krm-functions-catalog}" +PROMETHEUS_IMAGE="${DOCKERHUB_MIRROR}/prom/prometheus:latest" +GRAFANA_IMAGE="${DOCKERHUB_MIRROR}/grafana/grafana:latest" +PYROSCOPE_IMAGE="${DOCKERHUB_MIRROR}/grafana/pyroscope:latest" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +if ! command -v kubectl &> /dev/null; then + log_error "kubectl not found. 
Please install kubectl first." + exit 1 +fi + +check_kpt() { + if ! command -v kpt &> /dev/null; then + log_error "kpt not found. Please install kpt from: https://kpt.dev/installation/" + exit 1 + fi +} + +prepare_manifests() { + local temp_dir=$(mktemp -d) + cp -r "${METRICS_DIR}"/* "$temp_dir/" + + cat > "$temp_dir/Kptfile" < /dev/null 2>&1 + + echo "$temp_dir" +} + +apply_manifests() { + local manifests_dir=$1 + + log_info "Applying manifests using kpt live apply..." + if [ ! -f "$manifests_dir/resourcegroup.yaml" ]; then + log_info "Initializing kpt inventory..." + kpt live init "$manifests_dir" --namespace "$NAMESPACE" --name porch-monitoring + fi + + log_info "Running kpt live apply..." + kpt live apply "$manifests_dir" --reconcile-timeout=2m --output=events || { + log_warn "kpt live apply reconcile timeout - resources are deployed but may still be starting up" + } +} + +create_namespace() { + if kubectl get namespace "$NAMESPACE" &> /dev/null; then + log_info "Namespace $NAMESPACE already exists" + else + log_info "Creating namespace $NAMESPACE" + kubectl create namespace "$NAMESPACE" + fi +} + +deploy_monitoring() { + log_info "Deploying monitoring stack..." + log_info "Rendering manifests with kpt..." 
+ + local manifests_dir + manifests_dir=$(prepare_manifests) + + kubectl create configmap prometheus-config \ + --from-file="${SCRIPT_DIR}/../deployments/metrics-resources/prometheus-config.yaml" \ + -n "$NAMESPACE" \ + --dry-run=client -o yaml | kubectl apply -f - + + kubectl create configmap grafana-dashboards \ + --from-file=grafana-perf-test-dashboard.json="${SCRIPT_DIR}/../deployments/metrics-resources/grafana-perf-test-dashboard.json" \ + --from-file=grafana-pyroscope-dashboard.json="${SCRIPT_DIR}/../deployments/metrics-resources/grafana-pyroscope-dashboard.json" \ + --from-file=grafana-porch-server-dashboard.json="${SCRIPT_DIR}/../deployments/metrics-resources/grafana-porch-server-dashboard.json" \ + --from-file=grafana-resource-usage-dashboard.json="${SCRIPT_DIR}/../deployments/metrics-resources/grafana-resource-usage-dashboard.json" \ + -n "$NAMESPACE" \ + --dry-run=client -o yaml | kubectl apply -f - + + apply_manifests "$manifests_dir" + + rm -rf "$manifests_dir" + + log_info "Monitoring stack deployed successfully" +} + +wait_for_deployment() { + local deployment=$1 + log_info "Waiting for $deployment to be ready..." + kubectl wait --for=condition=available --timeout=300s deployment/$deployment -n "$NAMESPACE" +} + +get_service_urls() { + log_info "Getting service URLs..." + log_info "Setting up port forwarding..." + + pkill -f "port-forward.*prometheus" 2>/dev/null || true + pkill -f "port-forward.*grafana" 2>/dev/null || true + pkill -f "port-forward.*pyroscope" 2>/dev/null || true + sleep 2 + + kubectl port-forward -n "${NAMESPACE}" svc/prometheus 9092:9090 > /dev/null 2>&1 & + PROMETHEUS_PF_PID=$! + kubectl port-forward -n "${NAMESPACE}" svc/grafana 3001:3000 > /dev/null 2>&1 & + GRAFANA_PF_PID=$! + kubectl port-forward -n "${NAMESPACE}" svc/pyroscope 4040:4040 > /dev/null 2>&1 & + PYROSCOPE_PF_PID=$! 
+ + sleep 2 + + echo "${PROMETHEUS_PF_PID}" > /tmp/porch-prometheus-pf.pid + echo "${GRAFANA_PF_PID}" > /tmp/porch-grafana-pf.pid + echo "${PYROSCOPE_PF_PID}" > /tmp/porch-pyroscope-pf.pid + + PROMETHEUS_URL="http://localhost:9092" + GRAFANA_URL="http://localhost:3001" + PYROSCOPE_URL="http://localhost:4040" + PROMETHEUS_NODEPORT_URL="http://localhost:${PROMETHEUS_NODEPORT}" + GRAFANA_NODEPORT_URL="http://localhost:${GRAFANA_NODEPORT}" + + echo "" + log_info "==========================================" + log_info "Services deployed successfully!" + log_info "==========================================" + echo "" + log_info "Access via port-forward (recommended):" + log_info " Prometheus: ${PROMETHEUS_URL}" + log_info " Grafana: ${GRAFANA_URL}" + log_info " Username: admin" + log_info " Password: admin" + log_info " Pyroscope: ${PYROSCOPE_URL}" + echo "" + log_info "Or access via NodePort:" + log_info " Prometheus: ${PROMETHEUS_NODEPORT_URL}" + log_info " Grafana: ${GRAFANA_NODEPORT_URL}" + echo "" + log_info " - Prometheus UI on ${PROMETHEUS_URL}" + log_info " - porch-server metrics on port 9093" + log_info " - function-runner metrics on port 9094" + log_info "" + log_info "Note: Performance tests expose metrics on port 9095" + log_info " (typically at 172.17.0.1:9095 for scraping from within cluster)" + log_info " For kind clusters, tests scrape from host.docker.internal:9095" + log_info " (resolves to host machine from container)" + echo "" + log_info "To stop port forwarding:" + log_info ' kill $(cat /tmp/porch-prometheus-pf.pid /tmp/porch-grafana-pf.pid /tmp/porch-pyroscope-pf.pid 2>/dev/null)' + echo "" +} + +cleanup() { + log_warn "Cleaning up existing deployment..." + + if [ -f /tmp/porch-prometheus-pf.pid ] || [ -f /tmp/porch-grafana-pf.pid ] || [ -f /tmp/porch-pyroscope-pf.pid ]; then + log_info "Stopping port forwarding..."
+ kill $(cat /tmp/porch-prometheus-pf.pid /tmp/porch-grafana-pf.pid /tmp/porch-pyroscope-pf.pid 2>/dev/null) 2>/dev/null || true + rm -f /tmp/porch-prometheus-pf.pid /tmp/porch-grafana-pf.pid /tmp/porch-pyroscope-pf.pid + fi + + if kubectl get namespace "$NAMESPACE" &> /dev/null; then + log_info "Deleting resources in namespace $NAMESPACE..." + kubectl delete deployment prometheus grafana pyroscope -n "$NAMESPACE" --ignore-not-found=true + kubectl delete service prometheus grafana pyroscope -n "$NAMESPACE" --ignore-not-found=true + kubectl delete configmap prometheus-config grafana-dashboards grafana-dashboards-provider grafana-datasources -n "$NAMESPACE" --ignore-not-found=true + kubectl delete serviceaccount prometheus pyroscope -n "$NAMESPACE" --ignore-not-found=true + kubectl delete clusterrole prometheus -n "$NAMESPACE" --ignore-not-found=true + kubectl delete clusterrolebinding prometheus -n "$NAMESPACE" --ignore-not-found=true + + log_info "Deleting namespace $NAMESPACE..." + kubectl delete namespace "$NAMESPACE" --ignore-not-found=true + else + log_info "Namespace $NAMESPACE does not exist, nothing to clean up" + fi + + log_info "Cleanup completed" +} + +main() { + local action="${1:-deploy}" + case "$action" in + deploy) + log_info "Starting deployment of Prometheus and Grafana..." 
+ check_kpt + create_namespace + deploy_monitoring + wait_for_deployment prometheus + wait_for_deployment grafana + wait_for_deployment pyroscope + get_service_urls + ;; + cleanup) + check_kpt + cleanup + ;; + restart) + check_kpt + cleanup + sleep 2 + main deploy + ;; + *) + log_error "Unknown action: $action" + echo "Usage: $0 {deploy|cleanup|restart}" + echo "" + echo "Environment variables:" + echo " NAMESPACE - Kubernetes namespace (default: porch-monitoring)" + echo " PROMETHEUS_NODEPORT - Prometheus NodePort (default: 30091)" + echo " GRAFANA_NODEPORT - Grafana NodePort (default: 30301)" + echo "" + echo "Requirements:" + echo " - kpt CLI (install from: https://kpt.dev/installation/)" + echo " - kubectl configured with cluster access" + exit 1 + ;; + esac +} +main "$@" + diff --git a/scripts/install-dev-gitea-setup.sh b/scripts/install-dev-gitea-setup.sh index f992ef30f..173a9d383 100755 --- a/scripts/install-dev-gitea-setup.sh +++ b/scripts/install-dev-gitea-setup.sh @@ -23,6 +23,14 @@ git_repo_name=${1:-porch-test} gitea_ip=${2:-172.18.255.200} # should be from the address range in deployments/local/metallb-conf.yaml git_root="$(readlink -f "${self_dir}/..")" + +DOT_ENV_PATH="${git_root}/.env" +if [ -f "$DOT_ENV_PATH" ]; then + export $(grep -v '^#' "$DOT_ENV_PATH" | xargs) +fi + +DOCKERHUB_MIRROR="${DOCKERHUB_MIRROR:-docker.io}" +SET_IMAGE_IMG="ghcr.io/kptdev/krm-functions-catalog/set-image:v0.1.1" TEST_BLUEPRINTS_PATH="${git_root}/test/pkgs/test-pkgs/test-blueprints.bundle" cd "${git_root}" @@ -104,7 +112,15 @@ else fi cd "${git_root}/.build/gitea" - + +# Update gitea images to use DOCKERHUB_MIRROR +if [ "${DOCKERHUB_MIRROR}" != "docker.io" ]; then + h1 "Updating gitea images to use DOCKERHUB_MIRROR: ${DOCKERHUB_MIRROR}" + kpt fn eval . 
--image "${SET_IMAGE_IMG}" -- \ + "name=gitea/gitea" \ + "newName=${DOCKERHUB_MIRROR}/gitea/gitea" +fi + # Check if the gitea service of type LoadBalancer exists in the 'gitea' namespace if kubectl get svc gitea-lb -n gitea --no-headers 2>/dev/null | grep -q LoadBalancer; then h1 Gitea LoadBalancer service exists. Skipping mutations diff --git a/test/e2e/performance/README.md b/test/e2e/performance/README.md deleted file mode 100644 index 14fa27c5a..000000000 --- a/test/e2e/performance/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Iterative Performance Test - -## Setup - -Run `prometheus_in_docker.sh create`, this will create a docker network so that the container can reach the metrics server, -then starts a container of Prometheus reachable at `localhost:9090`. - -## Running - -You can see all arguments with their default values by running `go test -v ./... -help=true` in this directory. - -The `PERF` environment variable must be set to `1` in order for the test to start. - -### Examples - -`PERF=1 go test -v ./... -repos 1 -iterations 10,20,30` - -run 3 iterations, creating 10, then 20, then 30 control PackageRevisions in 1 Repository - -`PERF=1 go test -v ./... -repos 3 -iterations 10,20,10,30` - -run 4 iterations, creating 30 (10x3), then 60 (20x3), then 30 (10x3), then 90 (30*3) PackageRevisions across 3 Repositories - -## Teardown - -Run `prometheus_in_docker.sh clean`, this will delete the container and the network. 
- -## Example queries for Prometheus - -### propose (k8s client update) latency averaged over 1s + actual PR count - -``` -rate(porch_operation_duration_ms_sum{operation="propose"}[1s])/rate(porch_operation_duration_ms_count{operation="propose"}[1s]) -or -porch_package_revisions_count -``` - -### all k8s client operation latency averaged over 1s for control packages + actual PR count - -``` -rate(porch_operation_duration_ms_sum{name=~"iterative-.*"}[1s])/rate(porch_operation_duration_ms_count{name=~"iterative-.*"}[1s]) -or -porch_package_revisions_count -``` diff --git a/test/e2e/performance/config.yml b/test/e2e/performance/config.yml deleted file mode 100644 index 4524402b4..000000000 --- a/test/e2e/performance/config.yml +++ /dev/null @@ -1,12 +0,0 @@ -global: - scrape_interval: 0s100ms - evaluation_interval: 0s100ms - -scrape_configs: - - job_name: 'porch_metrics' - static_configs: - - targets: ['host.docker.internal:2113'] - scrape_interval: 0s100ms - -rule_files: - - "./rules.yml" \ No newline at end of file diff --git a/test/e2e/performance/iterative_test.go b/test/e2e/performance/iterative_test.go deleted file mode 100644 index a2e2db9b5..000000000 --- a/test/e2e/performance/iterative_test.go +++ /dev/null @@ -1,378 +0,0 @@ -package performance - -import ( - "encoding/json" - "flag" - "fmt" - "os" - "path/filepath" - "regexp" - "strconv" - "strings" - "sync" - "testing" - "time" - - porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" - suiteutils "github.com/nephio-project/porch/test/e2e/suiteutils" - "github.com/stretchr/testify/suite" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -const ( - controlRepoName = "iterative-control" - controlPackageName = "control-package" - testRepoName = "iterative-test" - testPackageName = "test-package" - - startingWS = "v1" -) - -var ( - repoCount = flag.Int("repos", 1, "Number of repositories to create") - iterations = flag.String("iterations", "10,25,50", 
"Number of control packages to create PER REPOSITORY for each iteration (e.g. \"10,25,50,100\")") - sampling = flag.Int("sampling", 5, "Number of measurements to take per iteration") - writeToFile = flag.Bool("write-to-file", false, "Write results to file") - outputDir = flag.String("output-dir", ".build/perf/iterative", "Where to write the results to if '-write-to-file' is set") - printToLog = flag.Bool("print-to-log", false, "Print results to log") - cooldown = flag.Duration("cooldown", 0, "Time to wait between iterations") -) - -type IterativeTest struct { - PerformanceSuite - - // entire test vars - metrics []FullMetricsData - iterations []int - sampling int - repos []string - cooldown time.Duration - - // per subtest vars - controlCount int - iterationIndex int -} - -func TestIterative(t *testing.T) { - if os.Getenv("PERF") != "1" { - t.Skip("PERF != 1") - } - test := &IterativeTest{} - test.UseGitea = true - suite.Run(t, test) -} - -func (t *IterativeTest) SetupSuite() { - t.parseFlags() // validate flags before starting the git server - t.PerformanceSuite.SetupSuite() - t.createRepos() - t.metrics = []FullMetricsData{} -} - -func (t *IterativeTest) parseFlags() { - flag.Parse() - its, err := toIntSlice(*iterations) - if err != nil { - t.Fatalf("failed to parse iterations: %v", err) - } - if len(its) == 0 { - t.Fatalf("Must specify at least one iteration (-iterations)") - } - t.iterations = its - - if *repoCount < 1 { - t.Fatalf("Repository count must be at least 1 (-repos)") - } - t.repos = []string{} - for i := range *repoCount { - t.repos = append(t.repos, fmt.Sprintf("%s-%d", controlRepoName, i+1)) - } - - t.sampling = *sampling - if t.sampling < 1 { - t.Fatalf("Test package count must be at least 1 (-sampling)") - } - - t.cooldown = *cooldown -} - -func toIntSlice(s string) ([]int, error) { - regex := regexp.MustCompile(`^\d+(,\d+)*$`) - if !regex.MatchString(s) { - return nil, fmt.Errorf("cannot parse %q as int slice", s) - } - var out []int - for _, v 
:= range strings.Split(s, ",") { - i, err := strconv.Atoi(v) - if err != nil { - return nil, err - } - out = append(out, i) - } - return out, nil -} - -func (t *IterativeTest) createRepos() { - var wg sync.WaitGroup - for _, repo := range t.repos { - wg.Add(1) - go func() { - defer wg.Done() - t.RegisterGitRepositoryF(t.GetPorchTestRepoURL(), repo, repo, suiteutils.GiteaUser, suiteutils.GiteaPassword) - }() - } - t.RegisterGitRepositoryF(t.GetPorchTestRepoURL(), testRepoName, "iterativedir", suiteutils.GiteaUser, suiteutils.GiteaPassword) - wg.Wait() -} - -func (t *IterativeTest) SetupSubTest() { - times := make([]MyDuration, len(t.repos)) - var wg sync.WaitGroup - for i, repo := range t.repos { - wg.Add(1) - go func() { - defer wg.Done() - times[i] = Measure(func() { t.createPackageRevisions(t.controlCount, repo) }) - }() - } - wg.Wait() - var timeToCreateAllRevisions MyDuration = 0 - for _, v := range times { - timeToCreateAllRevisions += v - } - t.metrics = append(t.metrics, FullMetricsData{ - ControlRevisionCount: t.controlCount, - IterationIndex: t.iterationIndex, - CreateControlRevisionsTotal: timeToCreateAllRevisions, - CreateControlRevisionsAvg: AvgDuration(timeToCreateAllRevisions, t.controlCount), - }) -} - -func (t *IterativeTest) TearDownSubTest() { - timeToDeleteAllRevisions := Measure(func() { t.deletePackageRevisions() }) - t.metrics[len(t.metrics)-1].DeleteControlRevisionsTotal = timeToDeleteAllRevisions - t.metrics[len(t.metrics)-1].DeleteControlRevisionsAvg = AvgDuration(timeToDeleteAllRevisions, t.controlCount) -} - -func (t *IterativeTest) TestIterative() { - for i, n := range t.iterations { - t.controlCount = n - t.iterationIndex = i + 1 - subtestName := fmt.Sprintf("Iterative-%d-%d", t.iterationIndex, t.controlCount) - result := t.Run(subtestName, func() { - var iterationMetrics []IterationMetricsData - for range t.sampling { - currentMetrics := t.collectMetrics() - t.ensureTestPackagesDeleted() - iterationMetrics = append(iterationMetrics, 
*currentMetrics) - } - t.mergeMetrics(iterationMetrics) - }) - if !result { - t.Errorf("%s failed, stopping early", subtestName) - break - } - if *writeToFile { - t.writeResult(&t.metrics[len(t.metrics)-1]) - } - if i != len(t.iterations)-1 { - <-time.After(t.cooldown) - } - } - if *printToLog { - t.printResults() - } - if *writeToFile { - t.writeResults() - } -} - -func (t *IterativeTest) ensureTestPackagesDeleted() { - list := &porchapi.PackageRevisionList{} - t.ListE(list, client.InNamespace(t.Namespace), client.MatchingFields{"spec.packageName": testPackageName}) - for _, pr := range list.Items { - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecycleDeletionProposed - t.UpdateApprovalL(&pr) - t.DeleteL(&pr) - } -} - -// mergeMetrics averages the gathered metrics of a single iteration, -// then appends these to the overall results -func (t *IterativeTest) mergeMetrics(iterationMetrics []IterationMetricsData) { - summed := iterationMetrics[0] - length := len(iterationMetrics) - for _, metric := range iterationMetrics[1:] { - summed.List += metric.List - summed.Create += metric.Create - summed.UpdateResources += metric.UpdateResources - summed.GetAfterResourceUpdate += metric.GetAfterResourceUpdate - summed.Propose += metric.Propose - summed.GetAfterPropose += metric.GetAfterPropose - summed.Approve += metric.Approve - summed.GetAfterPublish += metric.GetAfterPublish - summed.DeleteProposed += metric.DeleteProposed - summed.GetAfterProposeDelete += metric.GetAfterProposeDelete - summed.Delete += metric.Delete - } - summed.List = AvgDuration(summed.List, length) - summed.Create = AvgDuration(summed.Create, length) - summed.UpdateResources = AvgDuration(summed.UpdateResources, length) - summed.GetAfterResourceUpdate = AvgDuration(summed.GetAfterResourceUpdate, length) - summed.Propose = AvgDuration(summed.Propose, length) - summed.GetAfterPropose = AvgDuration(summed.GetAfterPropose, length) - summed.Approve = AvgDuration(summed.Approve, length) - 
summed.GetAfterPublish = AvgDuration(summed.GetAfterPublish, length) - summed.DeleteProposed = AvgDuration(summed.DeleteProposed, length) - summed.GetAfterProposeDelete = AvgDuration(summed.GetAfterProposeDelete, length) - summed.Delete = AvgDuration(summed.Delete, length) - t.metrics[len(t.metrics)-1].IterationMetricsData = summed -} - -// collectMetrics runs the inner part of the test, collecting the specified metrics -func (t *IterativeTest) collectMetrics() *IterationMetricsData { - output := &IterationMetricsData{} - var pr *porchapi.PackageRevision - - output.List = Measure(func() { t.ListF(&porchapi.PackageRevisionList{}) }) - - output.Create = Measure(func() { pr = t.CreatePackageDraftF(testRepoName, testPackageName, startingWS) }) - t.T().Cleanup(func() { t.DeleteL(pr) }) - - resources := &porchapi.PackageRevisionResources{} - t.GetF(client.ObjectKey{Namespace: t.Namespace, Name: pr.Name}, resources) - resources.Spec.Resources["README.md"] = "# updated readme" - output.UpdateResources = Measure(func() { t.UpdateF(resources) }) - - output.GetAfterResourceUpdate = Measure(func() { t.GetF(client.ObjectKey{Namespace: t.Namespace, Name: pr.Name}, pr) }) - - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecycleProposed - output.Propose = Measure(func() { t.UpdateF(pr) }) - - output.GetAfterPropose = Measure(func() { t.GetF(client.ObjectKey{Namespace: t.Namespace, Name: pr.Name}, pr) }) - - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecyclePublished - output.Approve = Measure(func() { t.UpdateApprovalF(pr) }) - - output.GetAfterPublish = Measure(func() { t.GetF(client.ObjectKey{Namespace: t.Namespace, Name: pr.Name}, pr) }) - - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecycleDeletionProposed - output.DeleteProposed = Measure(func() { t.UpdateApprovalF(pr) }) - - output.GetAfterProposeDelete = Measure(func() { t.GetF(client.ObjectKey{Namespace: t.Namespace, Name: pr.Name}, pr) }) - - output.Delete = Measure(func() { t.DeleteF(pr) }) - - return output -} - 
-// createPackageRevisions creates and publishes n number of control PackageRevisions. -// These will all be revisions of the same Package -func (t *IterativeTest) createPackageRevisions(n int, repoName string) { - pr := t.CreatePackageDraftF(repoName, controlPackageName, startingWS) - t.publishPackageRevision(pr) - - for i := 2; i <= n; i++ { - pr = t.createNewPackageRevisionFrom(pr, repoName, i) - t.publishPackageRevision(pr) - } -} - -// deletePackageRevisions deletes all PackageRevisions in the test cluster, -// including proposing deletion. -func (t *IterativeTest) deletePackageRevisions() { - var wg sync.WaitGroup - for _, repo := range append(t.repos, testRepoName) { - wg.Add(1) - go func() { - defer wg.Done() - prs := &porchapi.PackageRevisionList{} - t.ListE(prs, client.MatchingFields{"spec.repository": repo}) - for _, pr := range prs.Items { - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecycleDeletionProposed - t.UpdateApprovalL(&pr) - t.DeleteL(&pr) - } - }() - } - wg.Wait() -} - -// createNewPackageRevisionFrom takes a PackageRevision and creates a new PackageRevision from it -func (t *IterativeTest) createNewPackageRevisionFrom(pr *porchapi.PackageRevision, repoName string, i int) *porchapi.PackageRevision { - newPr := &porchapi.PackageRevision{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: pr.Namespace, - }, - Spec: porchapi.PackageRevisionSpec{ - PackageName: pr.Spec.PackageName, - WorkspaceName: fmt.Sprintf("v%d", i), - RepositoryName: repoName, - Tasks: pr.Spec.Tasks, - Lifecycle: porchapi.PackageRevisionLifecycleDraft, - }, - } - - t.CreateF(newPr) - return newPr -} - -// publishPackageRevision proposes, then publishes an already existing draft PackageRevision -func (t *IterativeTest) publishPackageRevision(pr *porchapi.PackageRevision) { - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecycleProposed - t.UpdateF(pr) - - pr.Spec.Lifecycle = porchapi.PackageRevisionLifecyclePublished - t.UpdateApprovalF(pr) -} - -// printResults prints all the 
gathered metrics to the log -func (t *IterativeTest) printResults() { - for _, metrics := range t.metrics { - t.Logf("results for %d revisions:\n%+v", metrics.ControlRevisionCount, metrics) - } -} - -// writeResult writes the result of one iteration to a json file -func (t *IterativeTest) writeResult(data *FullMetricsData) { - if err := t.ensureOutputDir(); err != nil { - return - } - filename := fmt.Sprintf("%d-%d.json", data.IterationIndex, data.ControlRevisionCount) - filepth := filepath.Join(*outputDir, filename) - t.write(filepth, data) -} - -// writeResults writes the results of all iteration to a combined json file -func (t *IterativeTest) writeResults() { - if err := t.ensureOutputDir(); err != nil { - return - } - filepth := filepath.Join(*outputDir, "full.json") - t.write(filepth, t.metrics) -} - -func (t *IterativeTest) write(filename string, data any) { - file, err := os.Create(filename) - if err != nil { - t.Logf("unable to create %s: %v", filename, err) - return - } - marshalled, err := json.MarshalIndent(data, "", " ") - if err != nil { - t.Logf("failed to marshal metrics: %v", err) - return - } - if _, err = file.Write(marshalled); err != nil { - t.Logf("failed to write to %s: %v", filename, err) - } -} - -func (t *IterativeTest) ensureOutputDir() error { - if err := os.MkdirAll(*outputDir, 0775); err != nil { - t.Logf("unable to create %s: %v", *outputDir, err) - return err - } - return nil -} diff --git a/test/e2e/performance/iterative_types.go b/test/e2e/performance/iterative_types.go deleted file mode 100644 index 0f86e1983..000000000 --- a/test/e2e/performance/iterative_types.go +++ /dev/null @@ -1,61 +0,0 @@ -package performance - -import ( - "fmt" - "time" -) - -// MyDuration is a wrapper around time.Duration for specifically handling time in (float) milliseconds -type MyDuration time.Duration - -func (d MyDuration) MarshalJSON() ([]byte, error) { - return []byte(fmt.Sprintf("%.4f", d.Milliseconds())), nil -} - -func (d MyDuration) 
Milliseconds() float64 { - return float64(d) / 1_000_000 -} - -func Measure(f func()) MyDuration { - start := time.Now() - f() - return MyDuration(time.Since(start)) -} - -func AvgDuration(total MyDuration, n int) MyDuration { - return MyDuration(int64(total) / int64(n)) -} - -type IterationMetricsData struct { - List, - Create, - UpdateResources, - GetAfterResourceUpdate, - Propose, - GetAfterPropose, - Approve, - GetAfterPublish, - DeleteProposed, - GetAfterProposeDelete, - Delete MyDuration -} - -type FullMetricsData struct { - ControlRevisionCount int - IterationIndex int - - CreateControlRevisionsTotal, - CreateControlRevisionsAvg, - DeleteControlRevisionsTotal, - DeleteControlRevisionsAvg MyDuration - - IterationMetricsData `json:",inline"` - - //ServerMemoryBytes int64 - //ControllersMemoryBytes int64 - //FuncRunnerMemoryBytes int64 - // - //ServerAvgCPULoad float32 - //ControllersAvgCPULoad float32 - //FuncRunnerAvgCPULoad float32 -} diff --git a/test/e2e/performance/performance_suite.go b/test/e2e/performance/performance_suite.go deleted file mode 100644 index a6272e553..000000000 --- a/test/e2e/performance/performance_suite.go +++ /dev/null @@ -1,229 +0,0 @@ -// Copyright 2025-2026 The kpt and Nephio Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package performance - -import ( - "flag" - "fmt" - "net/http" - "time" - - porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" - suiteutils "github.com/nephio-project/porch/test/e2e/suiteutils" - "github.com/prometheus/client_golang/prometheus/promhttp" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -var ( - metricsPort = flag.Uint("port", 2113, "Port on which to expose metrics") -) - -type PerformanceSuite struct { - suiteutils.TestSuiteWithGit - - metricsServer *http.Server - metricsShutdown chan struct{} -} - -func (t *PerformanceSuite) SetupSuite() { - flag.Parse() - t.TestSuiteWithGit.SetupSuite() - t.metricsServer = &http.Server{Addr: fmt.Sprintf("127.0.0.1:%d", *metricsPort)} - t.metricsShutdown = make(chan struct{}) - t.ServeMetrics() -} - -func (t *PerformanceSuite) TearDownSuite() { - t.ShutdownMetrics() -} - -func (t *PerformanceSuite) ServeMetrics() { - go func() { - t.Logf("Starting metrics server") - http.Handle("/metrics", promhttp.Handler()) - if err := t.metricsServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { - t.Fatalf("Error starting metrics server: %v", err) - } - t.Logf("Metrics server stopped") - t.metricsShutdown <- struct{}{} - }() -} - -// ShutdownMetrics tries to gracefully shut down the metrics server -func (t *PerformanceSuite) ShutdownMetrics() { - err := t.metricsServer.Shutdown(t.GetContext()) - if err != nil { - t.Logf("Error shutting down metrics server: %v", err) - } - select { - case <-t.metricsShutdown: - t.Logf("Metrics server shutdown complete") - case <-time.After(5 * time.Second): - t.Logf("Metrics server shutdown timed out") - } -} - -func (t *PerformanceSuite) incrementGuage(obj client.Object) { - if !t.T().Failed() { - switch KindOf(obj) { - case KindPackageRevision: - if obj.(*porchapi.PackageRevision).Spec.Revision != -1 { - packageRevisionGuage.Inc() - } - case KindRepository: - repositoryGauge.Inc() - } - } -} - -func (t *PerformanceSuite) decrementGuage(obj client.Object) { - if 
!t.T().Failed() { - switch KindOf(obj) { - case KindPackageRevision: - if obj.(*porchapi.PackageRevision).Spec.Revision != -1 { - packageRevisionGuage.Dec() - } - case KindRepository: - repositoryGauge.Dec() - } - } -} - -func (t *PerformanceSuite) GetE(key client.ObjectKey, obj client.Object) { - t.T().Helper() - MeasureAndRecord(OperationGet, obj, func() { t.TestSuiteWithGit.GetE(key, obj) }) -} - -func (t *PerformanceSuite) GetF(key client.ObjectKey, obj client.Object) { - t.T().Helper() - MeasureAndRecord(OperationGet, obj, func() { t.TestSuiteWithGit.GetF(key, obj) }) -} - -func (t *PerformanceSuite) ListE(list client.ObjectList, opts ...client.ListOption) { - t.T().Helper() - MeasureAndRecord(OperationList, list, func() { t.TestSuiteWithGit.ListE(list, opts...) }) -} - -func (t *PerformanceSuite) ListF(list client.ObjectList, opts ...client.ListOption) { - t.T().Helper() - MeasureAndRecord(OperationList, list, func() { t.TestSuiteWithGit.ListF(list, opts...) }) -} - -func (t *PerformanceSuite) CreateF(obj client.Object, opts ...client.CreateOption) { - t.T().Helper() - MeasureAndRecord(OperationCreate, obj, func() { t.TestSuiteWithGit.CreateF(obj, opts...) }) - - t.incrementGuage(obj) -} - -func (t *PerformanceSuite) CreateE(obj client.Object, opts ...client.CreateOption) { - t.T().Helper() - MeasureAndRecord(OperationCreate, obj, func() { t.TestSuiteWithGit.CreateE(obj, opts...) }) - - t.incrementGuage(obj) -} - -func (t *PerformanceSuite) DeleteF(obj client.Object, opts ...client.DeleteOption) { - t.T().Helper() - MeasureAndRecord(OperationDelete, obj, func() { t.TestSuiteWithGit.DeleteF(obj, opts...) }) - - t.decrementGuage(obj) -} - -func (t *PerformanceSuite) DeleteE(obj client.Object, opts ...client.DeleteOption) { - t.T().Helper() - MeasureAndRecord(OperationDelete, obj, func() { t.TestSuiteWithGit.DeleteE(obj, opts...) 
}) - - t.decrementGuage(obj) -} - -func (t *PerformanceSuite) DeleteL(obj client.Object, opts ...client.DeleteOption) { - t.T().Helper() - hadError := false - handler := func(format string, args ...any) { - hadError = true - t.Logf(format, args...) - } - MeasureAndRecord(OperationDelete, obj, func() { t.DeleteEH(obj, handler, opts...) }) - if !hadError { - t.decrementGuage(obj) - } -} - -func (t *PerformanceSuite) UpdateF(obj client.Object, opts ...client.UpdateOption) { - t.T().Helper() - MeasureAndRecord(getUpdateOperation(obj), obj, func() { t.TestSuiteWithGit.UpdateF(obj, opts...) }) -} - -func (t *PerformanceSuite) UpdateE(obj client.Object, opts ...client.UpdateOption) { - t.T().Helper() - MeasureAndRecord(getUpdateOperation(obj), obj, func() { t.TestSuiteWithGit.UpdateE(obj, opts...) }) -} - -func getUpdateOperation(obj client.Object) Operation { - if pr, ok := obj.(*porchapi.PackageRevision); ok && pr.Spec.Lifecycle == porchapi.PackageRevisionLifecycleProposed { - return OperationPropose - } - return OperationUpdate -} - -func (t *PerformanceSuite) PatchF(obj client.Object, patch client.Patch, opts ...client.PatchOption) { - t.T().Helper() - MeasureAndRecord(OperationPatch, obj, func() { t.TestSuiteWithGit.PatchF(obj, patch, opts...) }) -} - -func (t *PerformanceSuite) PatchE(obj client.Object, patch client.Patch, opts ...client.PatchOption) { - t.T().Helper() - MeasureAndRecord(OperationPatch, obj, func() { t.TestSuiteWithGit.PatchE(obj, patch, opts...) 
}) -} - -func (t *PerformanceSuite) UpdateApprovalL(pr *porchapi.PackageRevision) *porchapi.PackageRevision { - t.T().Helper() - var ret *porchapi.PackageRevision - MeasureAndRecord(getUpdateApprovalOperation(pr), pr, func() { ret = t.TestSuiteWithGit.UpdateApprovalL(pr) }) - return ret -} - -func (t *PerformanceSuite) UpdateApprovalF(pr *porchapi.PackageRevision) *porchapi.PackageRevision { - t.T().Helper() - var ret *porchapi.PackageRevision - MeasureAndRecord(getUpdateApprovalOperation(pr), pr, func() { ret = t.TestSuiteWithGit.UpdateApprovalL(pr) }) - return ret -} - -func getUpdateApprovalOperation(pr *porchapi.PackageRevision) Operation { - switch pr.Spec.Lifecycle { - case porchapi.PackageRevisionLifecyclePublished: - return OperationPublish - case porchapi.PackageRevisionLifecycleDeletionProposed: - return OperationProposeDelete - default: - return OperationUpdateApproval - } -} - -// copied, so the operation is recorded -func (t *PerformanceSuite) CreatePackageDraftF(repository, packageName, workspace string) *porchapi.PackageRevision { - t.T().Helper() - pr := t.CreatePackageSkeleton(repository, packageName, workspace) - pr.Spec.Tasks = []porchapi.Task{ - { - Type: porchapi.TaskTypeInit, - Init: &porchapi.PackageInitTaskSpec{}, - }, - } - t.CreateF(pr) - return pr -} diff --git a/test/e2e/performance/prometheus.go b/test/e2e/performance/prometheus.go deleted file mode 100644 index 8fd740cff..000000000 --- a/test/e2e/performance/prometheus.go +++ /dev/null @@ -1,84 +0,0 @@ -package performance - -import ( - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -type Operation string - -const ( - OperationGet Operation = "get" - OperationCreate Operation = "create" - OperationUpdate Operation = "update" - OperationUpdateApproval Operation = "update-approval" - OperationPatch Operation = "patch" - OperationPropose Operation = "propose" - OperationPublish Operation = "publish" - 
OperationProposeDelete Operation = "propose-delete" - OperationDelete Operation = "delete" - OperationList Operation = "list" -) - -const ( - LabelKind = "kind" - LabelOperation = "operation" - LabelName = "name" -) - -var labels = []string{LabelKind, LabelOperation, LabelName} - -var ( - KindPackageRevision = "PackageRevision" - KindPackageRevisionResources = "PackageRevisionResources" - KindRepository = "Repository" -) - -var ( - // Operation duration metrics - operationDuration = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "porch_operation_duration_ms", - Help: "Duration of Porch operations in milliseconds", - // Buckets: []float64{math.Inf(1)}, - Buckets: []float64{10, 25, 50, 100, 250}, - }, - labels, - ) - - // Operation counter metrics - operationCounter = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "porch_operations_total", - Help: "Total number of Porch operations", - }, - labels, - ) - - // How many repositories there are - repositoryGauge = promauto.NewGauge( - prometheus.GaugeOpts{ - Name: "porch_repositories_count", - Help: "Total number of repositories present", - }, - ) - - // How many package revisions there are - packageRevisionGuage = promauto.NewGauge( - prometheus.GaugeOpts{ - Name: "porch_package_revisions_count", - Help: "Total number of package revisions present", - }, - ) -) - -// RecordMetric records both duration and count for an operation -func RecordMetric(kind string, operation Operation, name string, duration float64) { - operationDuration.WithLabelValues(kind, string(operation), name).Observe(duration) - operationCounter.WithLabelValues(kind, string(operation), name).Inc() -} - -func MeasureAndRecord(op Operation, obj any, fn func()) { - time := Measure(func() { fn() }) - RecordMetric(KindOf(obj), op, NameOf(obj), time.Milliseconds()) -} diff --git a/test/e2e/performance/prometheus_in_docker.sh b/test/e2e/performance/prometheus_in_docker.sh deleted file mode 100755 index a8959aa2b..000000000 --- 
a/test/e2e/performance/prometheus_in_docker.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -SCRIPT_DIR=$(dirname $(readlink -e ${BASH_SOURCE[0]})) - -function create() { - if [[ ! $(docker network ls | grep prometheus) ]]; then - docker network create prometheus - fi - - docker run \ - -d \ - --name prometheus \ - --network prometheus \ - --add-host host.docker.internal:host-gateway \ - -p 9090:9090 \ - -v $SCRIPT_DIR/config.yml:/etc/prometheus/prometheus.yml \ - -v $SCRIPT_DIR/rules.yml:/etc/prometheus/rules.yml \ - prom/prometheus:v3.2.1 -} - -function clean() { - docker container stop prometheus ||: - docker container rm prometheus ||: - docker network rm prometheus ||: -} - -if [[ $# -ne 1 ]]; then - echo "specify 'create' or 'clean'" - exit 1 -fi - -case $1 in - create) - create - ;; - clean) - clean - ;; - *) - echo "$1 is not a valid operation, specify 'create' or 'clean'" - exit 1 - ;; -esac diff --git a/test/e2e/performance/rules.yml b/test/e2e/performance/rules.yml deleted file mode 100644 index 44cb98066..000000000 --- a/test/e2e/performance/rules.yml +++ /dev/null @@ -1,47 +0,0 @@ -groups: - - name: porch_rules - interval: 10s - rules: - - record: porch:package_revisions_count - expr: porch_package_revisions_count - - # Average duration for create operations - - record: porch:create_operation_duration_seconds - expr: rate(porch_operation_duration_ms_sum{operation="create"}[1m]) / rate(porch_operation_duration_ms_count{operation="create"}[1m]) - labels: - operation: create - - # Average duration for propose operations - - record: porch:propose_operation_duration_seconds - expr: rate(porch_operation_duration_ms_sum{operation="propose"}[1m]) / rate(porch_operation_duration_ms_count{operation="propose"}[1m]) - labels: - operation: propose - - # Average duration for approve operations - - record: porch:approve_operation_duration_seconds - expr: rate(porch_operation_duration_ms_sum{operation="approve"}[1m]) / 
rate(porch_operation_duration_ms_count{operation="approve"}[1m]) - labels: - operation: approve - - - - name: porch_alerts - rules: - # Alert for high propose operation latency - - alert: HighProposeLatency - expr: rate(porch_operation_duration_ms_sum{operation="propose"}[1m]) / rate(porch_operation_duration_ms_count{operation="propose"}[1m]) > 500 - for: 10s - labels: - severity: warning - annotations: - summary: "High latency on propose operations" - description: "Propose operation latency is above 500ms for more than 10 seconds." - - # Alert for high create operation latency - - alert: HighCreateLatency - expr: rate(porch_operation_duration_ms_sum{operation="create"}[1m]) / rate(porch_operation_duration_ms_count{operation="create"}[1m]) > 500 - for: 10s - labels: - severity: warning - annotations: - summary: "High latency on create operations" - description: "Create operation latency is above 500ms for more than 10 seconds." \ No newline at end of file diff --git a/test/e2e/performance/utils.go b/test/e2e/performance/utils.go deleted file mode 100644 index 3d0ed3920..000000000 --- a/test/e2e/performance/utils.go +++ /dev/null @@ -1,43 +0,0 @@ -package performance - -import ( - "fmt" - "reflect" - - porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" - configapi "github.com/nephio-project/porch/api/porchconfig/v1alpha1" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type hasName interface { - GetName() string -} - -func KindOf(o any) string { - if c, ok := o.(client.Object); ok { - if kind := c.GetObjectKind().GroupVersionKind().Kind; kind != "" { - return kind - } - } - // is regular type - if name := reflect.TypeOf(o).Name(); name != "" { - return name - } - // is pointer - return reflect.TypeOf(o).Elem().Name() -} - -func NameOf(o any) string { - switch c := o.(type) { - case *porchapi.PackageRevision: - return fmt.Sprintf("%s/%s", c.Spec.RepositoryName, c.Spec.PackageName) - case *porchapi.PackageRevisionResources: - return fmt.Sprintf("%s/%s", 
c.Spec.RepositoryName, c.Spec.PackageName) - case *configapi.Repository: - return c.Name - case hasName: - return c.GetName() - default: - return "" - } -} diff --git a/test/performance/README.md b/test/performance/README.md index 4953f69fa..df8bb15f3 100644 --- a/test/performance/README.md +++ b/test/performance/README.md @@ -1,11 +1,9 @@ -Copyright 2024, 2026 The Nephio Authors +Copyright 2026 The Nephio Authors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,13 +16,14 @@ limitations under the License. ## Prerequisites - Docker - Kubernetes CLI (kubectl) +- Kpt CLI - Go development environment - Access to GitHub repositories ## 1. Set Up Development Environment -Run the following script to set up the development environment: +Run the following script from the porch directory to set up the development environment: ```bash -https://github.com/nephio-project/porch/blob/main/scripts/setup-dev-env.sh +./scripts/setup-dev-env.sh ``` ## 2. Build and Deploy Porch @@ -34,103 +33,124 @@ make run-in-kind ``` Once deployment is complete, you should see the related pods running. -## 3. Create a Demo Namespace -Create a namespace for the demo setup: -```bash -kubectl create namespace porch-demo -``` +## 3. Deploy Monitoring Stack (Optional) -## 4. Define Gitea Repositories in Porch -Apply the repository configuration file: ```bash -kubectl apply -f examples/tutorials/starting-with-porch/porch-repositories.yaml +./scripts/deploy-monitoring.sh deploy ``` -Note: The Gitea credentials secret is automatically created when running the tests. 
The configuration can be found in `gitea-secret.yaml`. +This will: +- Create a `porch-monitoring` namespace +- Deploy Prometheus on port 9090 inside the cluster +- Deploy Grafana on port 3000 inside the cluster +- Configure Prometheus as a Grafana datasource +- Load the Porch performance dashboard +- Setup port-forwarding: Prometheus on localhost:9092, Grafana on localhost:3001 + +**Important for kind clusters**: Prometheus is configured to scrape metrics from the host machine via the Docker gateway IP (172.17.0.1). The performance test exposes metrics on port 9095 on the host, and Prometheus running inside the kind cluster accesses them via `172.17.0.1:9095`. + +### Access the Monitoring Tools -## 5. Verify Repository Setup -Check the status of repositories in the porch-demo namespace: +- **Prometheus**: http://localhost:9092 (via port-forward) +- **Grafana**: http://localhost:3001/dashboards + +### Cleanup Monitoring Stack ```bash -kubectl get repositories -n porch-demo +./scripts/deploy-monitoring.sh cleanup ``` +This will delete all monitoring resources and stop port forwarding. -## 6. Run Performance Tests -Navigate to the performance test directory: +### Restart Monitoring Stack ```bash -cd test/performance/ +./scripts/deploy-monitoring.sh restart ``` -Execute the performance test command: +## 4. Run Performance Tests +Navigate to the performance test directory: ```bash -E2E=1 go test -v ./... -repos=4 -packages=4 -timeout 60m +cd test/performance/ ``` -Parameters: -- `repos`: Number of repositories to test -- `packages`: Number of package revisions in each repository +There are two main performance tests available: -Once the tests begin, the logs can be observed in logs dir a sample is shared below. 
-``` -=== Iteration 0 Results === -Operation Duration Status --------------------------------------------------- -Create Gitea Repository 331ms Success -Create Porch Repository 7ms Success -Wait for Porch Repository Ready 2.008s Success - -=== Iteration 1 Results === -Operation Duration Status --------------------------------------------------- -Create PackageRevision 457ms Success -Update to Proposed 561ms Success -Update to Published 699ms Success -Delete PackageRevision 3.171s Success - -=== Iteration 1 Results === -Operation Duration Status --------------------------------------------------- -Delete Repository 5ms Success - -=== Consolidated Performance Test Results === -Operation Min Max Avg Total ------------------------------------------------------------------------- -Update to Published 699ms 699ms 699ms 699ms -Delete PackageRevision 3.171s 3.171s 3.171s 3.171s -Create Gitea Repository 331ms 331ms 331ms 331ms -Create Porch Repository 7ms 7ms 7ms 7ms -Wait for Porch Repository Ready 2.008s 2.008s 2.008s 2.008s -Create PackageRevision 457ms 457ms 457ms 457ms -Update to Proposed 561ms 561ms 561ms 561ms +### Run Load Test +Creates a preset amount of repositories, packages, and revisions to simulate load on the Porch system. +```bash +LOAD_TEST=1 go test -v ./... -timeout 1h ``` +Make sure to scale the timeout as needed and adjust test parameters as required. +Namely `-repos`, `-packages`, and `-revisions` to simulate the desired load and `-enable-prometheus` to enable visual metrics. -## 7. Verify Prometheus Targets -Check the Prometheus targets: -``` -http://localhost:9090/targets +### Maximum Package Revisions Test +Tests the maximum number of package revisions that can be handled by Porch in a single repository. +```bash +MAX_PR_TEST=1 go test -v ./... -timeout 1h ``` -You should be able to run PromQL queries here. +Make sure to scale the timeout as needed and adjust test parameters as required -> recommended is 72h. 
+ +### Available Test Parameters: +- `-namespace`: Kubernetes namespace to use for the test (default: `porch-metrics`) +- `-repos`: Number of repositories to test +- `-packages`: Number of packages per repository +- `-revisions`: Number of package revisions per package +- `-repo-parallelism`: Number of repositories to create in parallel +- `-package-parallelism`: Number of packages to create in parallel per repository +- `-enable-deletion`: Enable deletion of package revisions at the end of the test +- `-error-rate`: Maximum percentage of package revisions allowed to fail lifecycle transition +- `-enable-prometheus`: Enable Prometheus metrics server (default: false) -> **do not enable if monitoring stack is not deployed** +- `-prr-padding-mb`: Amount of padding data in MB to add to each PackageRevisionResource to increase its size and test performance with larger resources +- `-metrics-log-prefix`: Prefix for the timestamped metrics log file +- `-results-file`: File name for test results +- `-detailed-log-file`: File name for detailed log +- `-repo-results-csv`: File name for repository results CSV +- `-operations-csv`: File name for operations details CSV +- `-deletion-csv`: File name for deletion operations CSV +- `-kptfile-path`: Path to the Kptfile +- `-package-resources-path`: Path to the package resources + +All results are stored in logs files located in the `test/performance/` and `test/performance/logs` directories. + +## 5. Sample Output -## 8. Check Metrics Output -Retrieve the captured metrics using: ```bash -curl -k http://localhost:2113/metrics +LOAD_TEST=1 go test -v ./test/performance -namespace=db-demo -repos=1 -packages=1 -revisions=3 -enable-prometheus=true -enable-deletion=true -timeout 1h ``` -This should return all the collected performance metrics from the test execution. -## Prometheus Setup - -The performance testing framework uses a Kubernetes-based Prometheus setup. 
The configuration is stored in `prometheus-manifests.yaml` and includes: - -- ConfigMap with Prometheus configuration -- Deployment for the Prometheus pod -- Service to expose Prometheus - -The Prometheus instance is automatically deployed in the `porch-demo` namespace when running the tests. You can access the Prometheus UI at: ``` -http://localhost:9090 +=== Consolidated Performance Test Results === +Operation Min Max Avg Total +------------------------------------------------------------------------------------ +Create Gitea Repository R0 272ms 272ms 272ms 272ms +Create Porch Repository R0 3ms 3ms 3ms 3ms +Repository Ready Wait R0 2.004s 2.004s 2.004s 2.004s +Package Revision List v1 9ms 9ms 9ms 9ms +Package Revision List v2 6ms 6ms 6ms 6ms +Package Revision List v3 6ms 6ms 6ms 6ms +Package Revision Create v1 10ms 10ms 10ms 10ms +Package Revision Create v2 9ms 9ms 9ms 9ms +Package Revision Create v3 9ms 9ms 9ms 9ms +Package Revision Get Resources v1 3ms 3ms 3ms 3ms +Package Revision Get Resources v2 3ms 3ms 3ms 3ms +Package Revision Get Resources v3 3ms 3ms 3ms 3ms +Package Revision Update v1 10ms 10ms 10ms 10ms +Package Revision Update v2 10ms 10ms 10ms 10ms +Package Revision Update v3 9ms 9ms 9ms 9ms +Package Revision Get v1 2ms 2ms 2ms 2ms +Package Revision Get v2 2ms 2ms 2ms 2ms +Package Revision Get v3 2ms 2ms 2ms 2ms +Package Revision Propose v1 11ms 11ms 11ms 11ms +Package Revision Propose v2 9ms 9ms 9ms 9ms +Package Revision Propose v3 10ms 10ms 10ms 10ms +Package Revision Get (Proposed) v1 2ms 2ms 2ms 2ms +Package Revision Get (Proposed) v2 2ms 2ms 2ms 2ms +Package Revision Get (Proposed) v3 3ms 3ms 3ms 3ms +Package Revision Approve/Publish v1 348ms 348ms 348ms 348ms +Package Revision Approve/Publish v2 339ms 339ms 339ms 339ms +Package Revision Approve/Publish v3 340ms 340ms 340ms 340ms +Package Revision Propose Deletion v1 8ms 8ms 8ms 8ms +Package Revision Propose Deletion v2 6ms 6ms 6ms 6ms +Package Revision Propose Deletion v3 9ms 9ms 9ms 9ms 
+Package Revision Delete v1 262ms 262ms 262ms 262ms +Package Revision Delete v2 262ms 262ms 262ms 262ms +Package Revision Delete v3 269ms 269ms 269ms 269ms ``` - -The configuration can be modified by editing `prometheus-manifests.yaml`. Key settings include: -- Scrape interval: 1s -- Resource limits: 512Mi memory, 200m CPU -- NodePort: 30090 \ No newline at end of file diff --git a/test/performance/csv_generation.go b/test/performance/csv_generation.go new file mode 100644 index 000000000..39d8bdfbe --- /dev/null +++ b/test/performance/csv_generation.go @@ -0,0 +1,346 @@ +// Copyright 2026 The Nephio Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metrics + +import ( + "encoding/csv" + "fmt" + "os" + "sort" + "time" + + pkgerrors "github.com/pkg/errors" +) + +func (t *PerfTestSuite) generateCSVResults() error { + csvFile, err := os.Create(t.csvOptions.lifecycleCSV) + if err != nil { + return pkgerrors.Wrapf(err, "failed to create CSV file %s", t.csvOptions.lifecycleCSV) + } + defer func() { _ = csvFile.Close() }() + + writer := csv.NewWriter(csvFile) + defer writer.Flush() + + header := []string{} + for i := 0; i < t.testOptions.numRepos; i++ { + repoSuffix := fmt.Sprintf("%0*d", len(fmt.Sprintf("%d", t.testOptions.numRepos)), i) + header = append(header, + fmt.Sprintf("REPO-%s-PKG:REV", repoSuffix), + fmt.Sprintf("REPO-%s-TOTAL-LIFECYCLE-DURATION", repoSuffix), + ) + } + + if err := writer.Write(header); err != nil { + return err + } + + type pkgRevResult struct { + pkgName string + revision int + totalDur time.Duration + } + + repoResults := make(map[string][]pkgRevResult) + + t.metricsMutex.RLock() + for repoName, testMetric := range t.metrics { + for pkgName, revisions := range testMetric.pkgRevMetrics { + for revNum, revMetrics := range revisions { + var totalLifecycleDur time.Duration + lifecycleComplete := false + + if publishedOp, ok := revMetrics.Metrics[pkgRevPublished]; ok && publishedOp.Error == nil { + lifecycleComplete = true + } + + if lifecycleComplete { + for opKey, opMetric := range revMetrics.Metrics { + if opMetric.Error != nil { + continue + } + if opKey != pkgRevProposeDeletion && opKey != pkgRevDelete { + totalLifecycleDur += opMetric.Duration + } + } + } + + repoResults[repoName] = append(repoResults[repoName], pkgRevResult{ + pkgName: pkgName, + revision: revNum, + totalDur: totalLifecycleDur, + }) + } + } + + sort.Slice(repoResults[repoName], func(i, j int) bool { + if repoResults[repoName][i].pkgName != repoResults[repoName][j].pkgName { + return repoResults[repoName][i].pkgName < repoResults[repoName][j].pkgName + } + return repoResults[repoName][i].revision < 
repoResults[repoName][j].revision + }) + } + t.metricsMutex.RUnlock() + + maxRows := 0 + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + if len(repoResults[repoName]) > maxRows { + maxRows = len(repoResults[repoName]) + } + } + + for row := 0; row < maxRows; row++ { + record := []string{} + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + results := repoResults[repoName] + + if row < len(results) { + result := results[row] + pkgRev := fmt.Sprintf("%s:v%d", result.pkgName, result.revision) + duration := formatDuration(result.totalDur) + record = append(record, pkgRev, duration) + } else { + record = append(record, "", "0") + } + } + + if err := writer.Write(record); err != nil { + return err + } + } + + return nil +} + +func (t *PerfTestSuite) generateDetailedOperationsCSV() error { + csvFile, err := os.Create(t.csvOptions.operationsCSV) + if err != nil { + return pkgerrors.Wrapf(err, "failed to create operations CSV file %s", t.csvOptions.operationsCSV) + } + defer func() { _ = csvFile.Close() }() + + writer := csv.NewWriter(csvFile) + defer writer.Flush() + + header := []string{} + for i := 0; i < t.testOptions.numRepos; i++ { + repoSuffix := fmt.Sprintf("%0*d", len(fmt.Sprintf("%d", t.testOptions.numRepos)), i) + header = append(header, + fmt.Sprintf("REPO-%s-PKG:REV", repoSuffix), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevList), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevCreate), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevResourcesGet), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevUpdate), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevGet), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevPropose), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevGetProposed), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevPublished), + ) + } + + if err := writer.Write(header); err != nil { + return err + } + + type pkgRevOps struct { + 
pkgName string + revision int + ops map[string]time.Duration + } + + repoOps := make(map[string][]pkgRevOps) + + t.metricsMutex.RLock() + for repoName, testMetric := range t.metrics { + for pkgName, revisions := range testMetric.pkgRevMetrics { + for revNum, revMetrics := range revisions { + ops := make(map[string]time.Duration) + + for opKey, opMetric := range revMetrics.Metrics { + if opMetric.Error != nil { + continue + } + + if opKey != pkgRevProposeDeletion && opKey != pkgRevDelete { + ops[opKey] = opMetric.Duration + } + } + + repoOps[repoName] = append(repoOps[repoName], pkgRevOps{ + pkgName: pkgName, + revision: revNum, + ops: ops, + }) + } + } + + sort.Slice(repoOps[repoName], func(i, j int) bool { + if repoOps[repoName][i].pkgName != repoOps[repoName][j].pkgName { + return repoOps[repoName][i].pkgName < repoOps[repoName][j].pkgName + } + return repoOps[repoName][i].revision < repoOps[repoName][j].revision + }) + } + t.metricsMutex.RUnlock() + + maxRows := 0 + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + if len(repoOps[repoName]) > maxRows { + maxRows = len(repoOps[repoName]) + } + } + + for row := 0; row < maxRows; row++ { + record := []string{} + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + ops := repoOps[repoName] + + if row < len(ops) { + op := ops[row] + pkgRev := fmt.Sprintf("%s:v%d", op.pkgName, op.revision) + record = append(record, pkgRev) + + record = append(record, + formatDuration(op.ops[pkgRevList]), + formatDuration(op.ops[pkgRevCreate]), + formatDuration(op.ops[pkgRevResourcesGet]), + formatDuration(op.ops[pkgRevUpdate]), + formatDuration(op.ops[pkgRevGet]), + formatDuration(op.ops[pkgRevPropose]), + formatDuration(op.ops[pkgRevGetProposed]), + formatDuration(op.ops[pkgRevPublished]), + ) + } else { + record = append(record, "", "0", "0", "0", "0", "0", "0", "0", "0") + } + } + + if err := 
writer.Write(record); err != nil { + return err + } + } + + return nil +} + +func (t *PerfTestSuite) generateDeletionOperationsCSV() error { + csvFile, err := os.Create(t.csvOptions.deletionCSV) + if err != nil { + return pkgerrors.Wrapf(err, "failed to create deletion CSV file %s", t.csvOptions.deletionCSV) + } + defer func() { _ = csvFile.Close() }() + + writer := csv.NewWriter(csvFile) + defer writer.Flush() + + header := []string{} + for i := 0; i < t.testOptions.numRepos; i++ { + repoSuffix := fmt.Sprintf("%0*d", len(fmt.Sprintf("%d", t.testOptions.numRepos)), i) + header = append(header, + fmt.Sprintf("REPO-%s-PKG:REV", repoSuffix), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevProposeDeletion), + fmt.Sprintf("REPO-%s-%s", repoSuffix, pkgRevDelete), + ) + } + + if err := writer.Write(header); err != nil { + return err + } + + type pkgRevDelOps struct { + pkgName string + revision int + proposeDel time.Duration + deleteDur time.Duration + } + + repoDeletionOps := make(map[string][]pkgRevDelOps) + + t.metricsMutex.RLock() + for repoName, testMetric := range t.metrics { + for pkgName, revisions := range testMetric.pkgRevMetrics { + for revNum, revMetrics := range revisions { + var proposeDel, deleteDur time.Duration + + if proposeDelOp, ok := revMetrics.Metrics[pkgRevProposeDeletion]; ok && proposeDelOp.Error == nil { + proposeDel = proposeDelOp.Duration + } + if deleteOp, ok := revMetrics.Metrics[pkgRevDelete]; ok && deleteOp.Error == nil { + deleteDur = deleteOp.Duration + } + + if proposeDel > 0 || deleteDur > 0 { + repoDeletionOps[repoName] = append(repoDeletionOps[repoName], pkgRevDelOps{ + pkgName: pkgName, + revision: revNum, + proposeDel: proposeDel, + deleteDur: deleteDur, + }) + } + } + } + + sort.Slice(repoDeletionOps[repoName], func(i, j int) bool { + if repoDeletionOps[repoName][i].pkgName != repoDeletionOps[repoName][j].pkgName { + return repoDeletionOps[repoName][i].pkgName < repoDeletionOps[repoName][j].pkgName + } + return 
repoDeletionOps[repoName][i].revision < repoDeletionOps[repoName][j].revision + }) + } + t.metricsMutex.RUnlock() + + maxRows := 0 + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + if len(repoDeletionOps[repoName]) > maxRows { + maxRows = len(repoDeletionOps[repoName]) + } + } + + for row := 0; row < maxRows; row++ { + record := []string{} + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + ops := repoDeletionOps[repoName] + + if row < len(ops) { + op := ops[row] + pkgRev := fmt.Sprintf("%s:v%d", op.pkgName, op.revision) + proposeDelStr := formatDuration(op.proposeDel) + deleteStr := formatDuration(op.deleteDur) + record = append(record, pkgRev, proposeDelStr, deleteStr) + } else { + record = append(record, "", "0", "0") + } + } + + if err := writer.Write(record); err != nil { + return err + } + } + + return nil +} + +func formatDuration(d time.Duration) string { + if d == 0 { + return "0" + } + return fmt.Sprintf("%.3f", d.Seconds()) +} diff --git a/test/performance/gitea-secret.yaml b/test/performance/gitea-secret.yaml deleted file mode 100644 index 9a65f3713..000000000 --- a/test/performance/gitea-secret.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: gitea - namespace: porch-demo -type: kubernetes.io/basic-auth -stringData: - username: nephio - password: secret \ No newline at end of file diff --git a/test/performance/logger.go b/test/performance/logger.go index 661a3611d..37e945311 100644 --- a/test/performance/logger.go +++ b/test/performance/logger.go @@ -1,4 +1,4 @@ -// Copyright 2024, 2026 The Nephio Authors +// Copyright 2026 The Nephio Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -19,49 +19,193 @@ import ( "log" "os" "path/filepath" - "testing" + "sync" "time" + + pkgerrors "github.com/pkg/errors" ) +const timeDateFormat = "2006-01-02 15:04:05" + type TestLogger struct { - t *testing.T file *os.File logger *log.Logger + closed bool + mutex sync.Mutex } -func NewTestLogger(t *testing.T) (*TestLogger, error) { - // Creates logs directory if it doesn't exist +func (t *PerfTestSuite) NewTestLogger(prefix string) (*TestLogger, error) { logsDir := "logs" if err := os.MkdirAll(logsDir, 0755); err != nil { - return nil, fmt.Errorf("failed to create logs directory: %v", err) + return nil, pkgerrors.Wrapf(err, "failed to create logs directory %s", logsDir) } - // Create log file with timestamp - timestamp := time.Now().Format("2006-01-02-15-04-05") - filename := filepath.Join(logsDir, fmt.Sprintf("porch-metrics-%s.log", timestamp)) + timestamp := time.Now().Format(timeDateFormat) + filename := filepath.Join(logsDir, fmt.Sprintf("%s-%s.log", prefix, timestamp)) + + absPath, _ := filepath.Abs(filename) + fmt.Printf("Creating test log file: %s\n", absPath) + file, err := os.Create(filename) if err != nil { - return nil, fmt.Errorf("failed to create log file: %v", err) + return nil, pkgerrors.Wrapf(err, "failed to create log file %s", filename) } - logger := log.New(file, "", 0) // Remove timestamp from log entries + logger := log.New(file, "", 0) return &TestLogger{ - t: t, file: file, logger: logger, }, nil } +func (l *TestLogger) Sync() error { + l.mutex.Lock() + defer l.mutex.Unlock() + + if l.closed || l.file == nil { + return nil + } + return l.file.Sync() +} + func (l *TestLogger) Close() error { + l.mutex.Lock() + defer l.mutex.Unlock() + + if l.closed || l.file == nil { + return nil + } + + if err := l.file.Sync(); err != nil { + l.closed = true + _ = l.file.Close() + return err + } + + l.closed = true return l.file.Close() } func (l *TestLogger) LogResult(format string, args ...interface{}) { - if len(args) == 0 { - // If no args provided, 
treat format as plain string - l.logger.Println(format) - } else { - // Use as format string with args - l.logger.Printf(format, args...) + l.mutex.Lock() + defer l.mutex.Unlock() + + if l.closed || l.file == nil { + return + } + + l.logger.Printf(format, args...) + _ = l.file.Sync() +} + +type ResultsLogger struct { + resultsFile *os.File + logFile *os.File + mutex sync.Mutex + resultsFileClosed bool + logFileClosed bool +} + +func (t *PerfTestSuite) NewResultsLogger(resultsFileName, logFileName string) (*ResultsLogger, error) { + resultsFile, err := os.Create(resultsFileName) + if err != nil { + return nil, pkgerrors.Wrapf(err, "failed to create results file %s", resultsFileName) + } + + logFile, err := os.Create(logFileName) + if err != nil { + _ = resultsFile.Close() + return nil, pkgerrors.Wrapf(err, "failed to create log file %s", logFileName) + } + + return &ResultsLogger{ + resultsFile: resultsFile, + logFile: logFile, + }, nil +} + +func (l *ResultsLogger) LogApproved(repoName, pkgName string, revision int, prName string, duration time.Duration) { + l.mutex.Lock() + defer l.mutex.Unlock() + + if l.resultsFileClosed || l.resultsFile == nil { + return + } + + timestamp := time.Now().Format(timeDateFormat) + line := fmt.Sprintf("%s %s:%s:%d %s approved, took %.3f seconds\n", + timestamp, repoName, pkgName, revision, prName, duration.Seconds()) + _, _ = l.resultsFile.WriteString(line) + _ = l.resultsFile.Sync() +} + +func (l *ResultsLogger) LogDeleted(prName string, duration time.Duration) { + l.mutex.Lock() + defer l.mutex.Unlock() + + if l.resultsFileClosed || l.resultsFile == nil { + return + } + + timestamp := time.Now().Format(timeDateFormat) + line := fmt.Sprintf("%s %s deleted, took %.3f seconds\n", + timestamp, prName, duration.Seconds()) + _, _ = l.resultsFile.WriteString(line) + _ = l.resultsFile.Sync() +} + +func (l *ResultsLogger) LogToFile(format string, args ...interface{}) { + l.mutex.Lock() + defer l.mutex.Unlock() + + if l.logFileClosed || 
l.logFile == nil { + return + } + + _, _ = fmt.Fprintf(l.logFile, format+"\n", args...) + _ = l.logFile.Sync() +} + +func (l *ResultsLogger) Sync() error { + l.mutex.Lock() + defer l.mutex.Unlock() + + if !l.resultsFileClosed && l.resultsFile != nil { + if err := l.resultsFile.Sync(); err != nil { + return err + } + } + + if !l.logFileClosed && l.logFile != nil { + if err := l.logFile.Sync(); err != nil { + return err + } + } + + return nil +} + +func (l *ResultsLogger) Close() error { + l.mutex.Lock() + defer l.mutex.Unlock() + + var lastErr error + + if !l.resultsFileClosed && l.resultsFile != nil { + _ = l.resultsFile.Sync() + if err := l.resultsFile.Close(); err != nil { + lastErr = err + } + l.resultsFileClosed = true } + + if !l.logFileClosed && l.logFile != nil { + _ = l.logFile.Sync() + if err := l.logFile.Close(); err != nil { + lastErr = err + } + l.logFileClosed = true + } + + return lastErr } diff --git a/test/performance/metrics.go b/test/performance/metrics.go deleted file mode 100644 index 1d255d224..000000000 --- a/test/performance/metrics.go +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2024, 2026 The Nephio Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-package metrics - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "testing" - "time" - - porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" - configapi "github.com/nephio-project/porch/api/porchconfig/v1alpha1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -var ( - scheme = runtime.NewScheme() -) - -func init() { - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(porchapi.AddToScheme(scheme)) - utilruntime.Must(configapi.AddToScheme(scheme)) -} - -func createGiteaRepo(repoName string) error { - giteaURL := "http://172.18.255.200:3000/api/v1/user/repos" - payload := map[string]interface{}{ - "name": repoName, - "description": "Test repository for Porch metrics", - "private": false, - "auto_init": true, - } - - jsonPayload, err := json.Marshal(payload) - if err != nil { - return fmt.Errorf("failed to marshal payload: %v", err) - } - - req, err := http.NewRequest("POST", giteaURL, bytes.NewBuffer(jsonPayload)) - if err != nil { - return fmt.Errorf("failed to create request: %v", err) - } - - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth("nephio", "secret") - - client := &http.Client{} - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("failed to create repo: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusCreated { - return fmt.Errorf("failed to create repo, status: %d", resp.StatusCode) - } - - return nil -} - -func debugPackageStatus(t *testing.T, c client.Client, ctx context.Context, namespace, name string) { - var pkg porchapi.PackageRevision - err := c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, &pkg) - if err != nil { - t.Logf("Error getting package: %v", err) - return - } - - t.Logf("\nPackage Status Details:") - t.Logf(" Name: %s", pkg.Name) - t.Logf(" LifecycleState: 
%s", pkg.Spec.Lifecycle) - t.Logf(" WorkspaceName: %s", pkg.Spec.WorkspaceName) - t.Logf(" Revision: %v", pkg.Spec.Revision) - t.Logf(" Published: %v", pkg.Status.PublishedAt) - t.Logf(" Tasks:") - for i, task := range pkg.Spec.Tasks { - t.Logf(" %d. Type: %s", i+1, task.Type) - if task.Type == porchapi.TaskTypeInit && task.Init != nil { - t.Logf(" Description: %s", task.Init.Description) - t.Logf(" Keywords: %v", task.Init.Keywords) - } - } - t.Logf(" Conditions:") - for _, cond := range pkg.Status.Conditions { - t.Logf(" - Type: %s", cond.Type) - t.Logf(" Status: %s", cond.Status) - t.Logf(" Message: %s", cond.Message) - t.Logf(" Reason: %s", cond.Reason) - } -} - -// ... existing imports and code ... - -func deleteGiteaRepo(repoName string) error { - giteaURL := fmt.Sprintf("http://172.18.255.200:3000/api/v1/repos/nephio/%s", repoName) - - req, err := http.NewRequest("DELETE", giteaURL, nil) - if err != nil { - return fmt.Errorf("failed to create delete request: %v", err) - } - - req.SetBasicAuth("nephio", "secret") - - client := &http.Client{} - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("failed to delete repo: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusNotFound { - return fmt.Errorf("failed to delete repo, status: %d", resp.StatusCode) - } - - return nil -} -func waitForPorchRepository(ctx context.Context, c client.Client, t *testing.T, namespace, name string, timeout time.Duration) error { - start := time.Now() - for { - if time.Since(start) > timeout { - return fmt.Errorf("timeout waiting for repository to be ready") - } - - var repo configapi.Repository - err := c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, &repo) - if err != nil { - return err - } - - t.Logf("\nRepository conditions at %v:", time.Since(start)) - t.Logf("Spec: %+v", repo.Spec) - t.Logf("Status: %+v", repo.Status) - - ready := false - for _, cond := range 
repo.Status.Conditions { - t.Logf(" - Type: %s, Status: %s, Message: %s", - cond.Type, cond.Status, cond.Message) - if cond.Type == "Ready" && cond.Status == "True" { - ready = true - break - } - } - - if ready { - return nil - } - - time.Sleep(2 * time.Second) - } -} diff --git a/test/performance/performance_suite.go b/test/performance/performance_suite.go new file mode 100644 index 000000000..18f9ad11c --- /dev/null +++ b/test/performance/performance_suite.go @@ -0,0 +1,930 @@ +// Copyright 2026 The Nephio Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metrics + +import ( + "bytes" + "context" + "encoding/json" + "flag" + "fmt" + "net/http" + "os" + "os/signal" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + + "github.com/joho/godotenv" + porchclient "github.com/nephio-project/porch/api/generated/clientset/versioned" + porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" + configapi "github.com/nephio-project/porch/api/porchconfig/v1alpha1" + metrics "github.com/nephio-project/porch/internal/metrics" + porchotel "github.com/nephio-project/porch/internal/otel" + pkgerrors "github.com/pkg/errors" + "github.com/stretchr/testify/suite" + coreapi "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" +) + +const ( + prometheusPort = 9095 + giteaUsername = "nephio" + giteaPassword = "secret" +) + +var ( + scheme = runtime.NewScheme() + + namespace = flag.String("namespace", "porch-metrics", "Kubernetes namespace to use for the test") + numRepos = flag.Int("repos", 1, "Number of repositories to create") + numPackages = flag.Int("packages", 5, "Number of packages per repository") + numRevisions = flag.Int("revisions", 5, "Number of package revisions per package") + repoParallelism = flag.Int("repo-parallelism", 1, "Number of repositories to create in parallel") + packageParallelism = flag.Int("package-parallelism", 1, "Number of packages to create in parallel per repository") + errorRate = flag.Float64("error-rate", 0.1, "Maximum percentage of package revisions allowed to fail lifecycle transition") + enableDeletion = flag.Bool("enable-deletion", false, "Enable deletion of package revisions at the end of the test") + 
enablePrometheus = flag.Bool("enable-prometheus", false, "Enable Prometheus metrics server on port 9095") + prrPaddingInMB = flag.Int("prr-padding-mb", 0, "Amount of padding data in MB to add to each PackageRevisionResource to increase its size and test performance with larger resources") + + metricsLogFile = flag.String("metrics-log-prefix", "porch-metrics", "Prefix for the timestamped metrics log file") + resultsFile = flag.String("results-file", "load_test_results.txt", "File name for test results") + fullLogFile = flag.String("detailed-log-file", "load_test.log", "File name for detailed log") + lifecycleCSV = flag.String("repo-results-csv", "load_test_lifecycle_results.csv", "File name for repository results CSV") + operationsCSV = flag.String("operations-csv", "load_test_operations_results.csv", "File name for operations details CSV") + deletionCSV = flag.String("deletion-csv", "load_test_deletion_results.csv", "File name for deletion operations CSV") + kptfilePath = flag.String("kptfile-path", "resources/Kptfile", "Path to the Kptfile") + packageResourcesPath = flag.String("package-resources-path", "resources/deployment.yaml", "Path to the package resources") + + retryBackoff = wait.Backoff{ + Duration: 50 * time.Millisecond, + Steps: 100, + Factor: 1.25, + Cap: 30 * time.Second, + } +) + +type PerfTestSuite struct { + suite.Suite + ctx context.Context + cancelCtx context.CancelFunc + client client.Client + clientSet porchclient.Interface + + testLogger *TestLogger + resultsLogger *ResultsLogger + otelResources *porchotel.OTelResources + enablePrometheus bool + + metrics map[string]TestMetrics + metricsMutex sync.RWMutex + + testOptions TestOptions + logOptions LogOptions + csvOptions CSVOptions +} + +type TestOptions struct { + namespace string + numRepos int + numPkgs int + numRevs int + repoParallelism int + packageParallelism int + errorRate float64 + enableDeletion bool + kptfilePath string + packageResourcesPath string + krmFnRegistryURL string +
prrPaddingInMB int +} + +type LogOptions struct { + metricsLogFile string + resultsFile string + fullLogFile string +} + +type CSVOptions struct { + lifecycleCSV string + operationsCSV string + deletionCSV string +} + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(porchapi.AddToScheme(scheme)) + utilruntime.Must(configapi.AddToScheme(scheme)) +} + +func (t *PerfTestSuite) recordRepoMetric(repoName, opKey string, op OperationMetrics) { + t.metricsMutex.Lock() + defer t.metricsMutex.Unlock() + t.metrics[repoName].repoOps[opKey] = op +} + +func (t *PerfTestSuite) recordPkgRevMetric(repoName, pkgName string, revisionNum int, opKey string, op OperationMetrics) { + t.metricsMutex.Lock() + defer t.metricsMutex.Unlock() + t.metrics[repoName].pkgRevMetrics[pkgName][revisionNum].Metrics[opKey] = op +} + +func (t *PerfTestSuite) initPkgRevMetrics(repoName, pkgName string, revisionNum int) { + t.metricsMutex.Lock() + defer t.metricsMutex.Unlock() + t.metrics[repoName].pkgRevMetrics[pkgName][revisionNum] = PackageRevisionMetrics{ + pkgName: pkgName, + Revision: revisionNum, + Metrics: make(map[string]OperationMetrics), + } +} + +func getEnvWithDefault(key, defaultValue string) string { + _ = godotenv.Load(filepath.Join("..", "..", ".env")) + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} + +func (t *PerfTestSuite) SetupSuite() { + if os.Getenv("LOAD_TEST") != "1" && os.Getenv("MAX_PR_TEST") != "1" { + t.T().Skipf("Skipping performance tests in non-load test environment") + } + + flag.Parse() + + t.metrics = make(map[string]TestMetrics) + t.cancelOnSignal() + t.testOptions = TestOptions{ + namespace: *namespace, + numRepos: *numRepos, + numPkgs: *numPackages, + numRevs: *numRevisions, + repoParallelism: *repoParallelism, + packageParallelism: *packageParallelism, + errorRate: *errorRate, + enableDeletion: *enableDeletion, + kptfilePath: *kptfilePath, + packageResourcesPath: *packageResourcesPath,
+ krmFnRegistryURL: getEnvWithDefault("KRM_FN_REGISTRY_URL", "ghcr.io/kptdev/krm-functions-catalog"), + prrPaddingInMB: *prrPaddingInMB, + } + + t.logOptions = LogOptions{ + metricsLogFile: *metricsLogFile, + resultsFile: *resultsFile, + fullLogFile: *fullLogFile, + } + + t.csvOptions = CSVOptions{ + lifecycleCSV: *lifecycleCSV, + operationsCSV: *operationsCSV, + deletionCSV: *deletionCSV, + } + + logger, err := t.NewTestLogger(t.logOptions.metricsLogFile) + if err != nil { + t.T().Fatalf("Failed to create logger: %v", err) + } + + resultsLogger, err := t.NewResultsLogger(t.logOptions.resultsFile, t.logOptions.fullLogFile) + if err != nil { + t.T().Fatalf("Failed to create results logger: %v", err) + } + + cfg, err := config.GetConfig() + if err != nil { + t.T().Fatalf("Failed to get config: %v", err) + } + + c, err := client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + t.T().Fatalf("Failed to create client: %v", err) + } + + clientSet, err := porchclient.NewForConfig(cfg) + if err != nil { + t.T().Fatalf("Failed to create Porch clientset: %v", err) + } + + ctx, cancel := context.WithCancel(context.Background()) + t.ctx = ctx + t.cancelCtx = cancel + t.client = c + t.clientSet = clientSet + t.testLogger = logger + t.resultsLogger = resultsLogger + t.enablePrometheus = *enablePrometheus + + if t.enablePrometheus { + os.Setenv("OTEL_EXPORTER_PROMETHEUS_PORT", fmt.Sprintf("%d", prometheusPort)) + os.Setenv("OTEL_METRICS_EXPORTER", "prometheus") + os.Setenv("OTEL_TRACES_EXPORTER", "none") + var err error + t.otelResources, err = porchotel.SetupOpenTelemetry(ctx) + if err != nil { + t.T().Fatalf("Failed to setup OpenTelemetry: %v", err) + } + t.T().Logf("OTel metrics server started on port %v", prometheusPort) + metrics.PerfTestSetTestRunInfo("porch-performance-test", t.testOptions.namespace, time.Now()) + } + + t.T().Logf(" Running load test with:") + t.T().Logf(" Namespace: %s", t.testOptions.namespace) + t.T().Logf(" %d repositories", 
t.testOptions.numRepos) + t.T().Logf(" %d packages per repository", t.testOptions.numPkgs) + t.T().Logf(" %d revisions per package", t.testOptions.numRevs) + t.T().Logf(" Prometheus metrics: %v", t.enablePrometheus) + + if err = t.setupNamespaceAndSecret(); err != nil { + t.T().Fatalf("failed to setup namespace and secret: %v", err) + } + t.T().Logf("Created namespace %s and gitea secret", t.testOptions.namespace) + + t.T().Log("\n=== Cleaning up existing resources from previous runs ===") + if err = t.cleanupExistingResources(); err != nil { + t.T().Logf("Warning: Failed to cleanup existing resources: %v", err) + } + t.T().Log("Cleanup complete, ready to start test") +} + +func (t *PerfTestSuite) TearDownSuite() { + if t.cancelCtx != nil { + t.cancelCtx() + } + if t.otelResources != nil { + if err := t.otelResources.Flush(); err != nil { + t.T().Logf("Warning: Failed to flush metrics: %v", err) + } + t.T().Logf("Waiting 20 seconds before shutting down metrics server to ensure final scrapes complete...") + time.Sleep(20 * time.Second) + + if err := t.otelResources.ShutdownWithTimeout(5 * time.Second); err != nil { + t.T().Logf("Warning: Failed to shutdown OpenTelemetry: %v", err) + } + } + if t.testLogger != nil { + if err := t.testLogger.Close(); err != nil { + t.T().Logf("Warning: Failed to close test logger: %v", err) + } + } + if t.resultsLogger != nil { + if err := t.resultsLogger.Close(); err != nil { + t.T().Logf("Warning: Failed to close results logger: %v", err) + } + } +} + +func (t *PerfTestSuite) cancelOnSignal() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) + go func() { + sig := <-sigChan + t.T().Logf("\nReceived signal %v, cancelling test context...", sig) + if t.cancelCtx != nil { + t.cancelCtx() + } + }() +} + +func (t *PerfTestSuite) setupNamespaceAndSecret() error { + ns := &coreapi.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: t.testOptions.namespace, + }, + } + + err := 
t.client.Create(t.ctx, ns) + if err != nil && !apierrors.IsAlreadyExists(err) { + return pkgerrors.Wrapf(err, "failed to create namespace %s", t.testOptions.namespace) + } + + secret := &coreapi.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "gitea", + Namespace: t.testOptions.namespace, + }, + Type: coreapi.SecretTypeBasicAuth, + StringData: map[string]string{ + "username": giteaUsername, + "password": giteaPassword, + }, + } + + err = t.client.Create(t.ctx, secret) + if err != nil && !apierrors.IsAlreadyExists(err) { + return pkgerrors.Wrapf(err, "failed to create gitea secret in namespace %s", t.testOptions.namespace) + } + + return nil +} + +func (t *PerfTestSuite) cleanupExistingResources() error { + var repoList configapi.RepositoryList + if err := t.client.List(t.ctx, &repoList, client.InNamespace(t.testOptions.namespace)); err != nil { + if !apierrors.IsNotFound(err) { + return pkgerrors.Wrap(err, "failed to list repositories") + } + } else { + for _, repo := range repoList.Items { + if err := t.client.Delete(t.ctx, &repo); err != nil { + if !apierrors.IsNotFound(err) { + t.T().Errorf("failed to delete Repository %s: %v", repo.Name, err) + } + } + } + if len(repoList.Items) > 0 { + t.T().Logf("deleted %d existing Repositories", len(repoList.Items)) + time.Sleep(5 * time.Second) + } + } + + deletedCount := 0 + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + if err := deleteGiteaRepo(repoName); err == nil { + deletedCount++ + } + } + if deletedCount > 0 { + t.T().Logf("deleted %d existing Gitea repositories", deletedCount) + } + + return nil +} + +func (t *PerfTestSuite) createAndSetupRepo(repoName string) { + t.metricsMutex.Lock() + t.metrics[repoName] = TestMetrics{ + RepoName: repoName, + repoOps: make(map[string]OperationMetrics), + pkgRevMetrics: make(map[string]map[int]PackageRevisionMetrics), + } + t.metricsMutex.Unlock() + + start := time.Now() + err := createGiteaRepo(repoName) + 
duration := time.Since(start) + + t.recordRepoMetric(repoName, giteaRepoCreate, OperationMetrics{ + Operation: fmt.Sprintf("%s:%s", giteaRepoCreate, repoName), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(giteaRepoCreate, repoName, "", duration, err) + } + + if err != nil { + t.T().Errorf("Failed to create Gitea repository: %v", err) + return + } + + start = time.Now() + repo := &configapi.Repository{ + ObjectMeta: metav1.ObjectMeta{ + Name: repoName, + Namespace: t.testOptions.namespace, + }, + Spec: configapi.RepositorySpec{ + Type: "git", + Git: &configapi.GitRepository{ + Repo: fmt.Sprintf("http://gitea.gitea.svc.cluster.local:3000/nephio/%s", repoName), + Branch: "main", + SecretRef: configapi.SecretRef{ + Name: "gitea", + }, + CreateBranch: true, + }, + }, + } + + err = t.client.Create(t.ctx, repo) + duration = time.Since(start) + + t.recordRepoMetric(repoName, porchRepoCreate, OperationMetrics{ + Operation: fmt.Sprintf("%s:%s", porchRepoCreate, repoName), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(porchRepoCreate, repoName, "", duration, err) + } + + if err != nil { + t.T().Errorf("Failed to create Porch repository: %v", err) + return + } + + if t.enablePrometheus { + metrics.PerfTestIncrementRepositoryCounter() + } + startWait := time.Now() + err = t.waitForRepository(repoName, 60*time.Second) + duration = time.Since(startWait) + + t.recordRepoMetric(repoName, repoWait, OperationMetrics{ + Operation: fmt.Sprintf("%s:%s", repoWait, repoName), + Duration: duration, + Error: err, + Timestamp: startWait, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(repoWait, repoName, "", duration, err) + } +} + +func createGiteaRepo(repoName string) error { + giteaURL := "http://localhost:3000/api/v1/user/repos" + payload := map[string]interface{}{ + "name": repoName, + "description": "Test repository for Porch
metrics", + "private": false, + "auto_init": true, + } + + jsonPayload, err := json.Marshal(payload) + if err != nil { + return pkgerrors.Wrap(err, "failed to marshal payload") + } + + req, err := http.NewRequest("POST", giteaURL, bytes.NewBuffer(jsonPayload)) + if err != nil { + return pkgerrors.Wrap(err, "failed to create request") + } + + req.Header.Set("Content-Type", "application/json") + req.SetBasicAuth(giteaUsername, giteaPassword) + + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + return pkgerrors.Wrapf(err, "failed to create repo %s", repoName) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusCreated { + return pkgerrors.Errorf("failed to create repo, status: %d", resp.StatusCode) + } + + return nil +} + +func deleteGiteaRepo(repoName string) error { + giteaURL := fmt.Sprintf("http://localhost:3000/api/v1/repos/nephio/%s", repoName) + + req, err := http.NewRequest("DELETE", giteaURL, nil) + if err != nil { + return pkgerrors.Wrap(err, "failed to create delete request") + } + + req.SetBasicAuth(giteaUsername, giteaPassword) + + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + return pkgerrors.Wrapf(err, "failed to delete repo %s", repoName) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusNotFound { + return pkgerrors.Errorf("failed to delete repo, status: %d", resp.StatusCode) + } + + return nil +} +func (t *PerfTestSuite) waitForRepository(name string, timeout time.Duration) error { + start := time.Now() + for { + if time.Since(start) > timeout { + return pkgerrors.Errorf("timeout waiting for repository to be ready") + } + + var repo configapi.Repository + err := t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: name}, &repo) + if err != nil { + return err + } + + t.T().Logf("\nRepository conditions at %v:", time.Since(start)) + t.T().Logf("Spec: %+v", repo.Spec) + t.T().Logf("Status: %+v", 
repo.Status) + + ready := false + for _, cond := range repo.Status.Conditions { + t.T().Logf(" - Type: %s, Status: %s, Message: %s", + cond.Type, cond.Status, cond.Message) + if cond.Type == "Ready" && cond.Status == "True" { + ready = true + break + } + } + + if ready { + return nil + } + + time.Sleep(2 * time.Second) + } +} + +func (t *PerfTestSuite) doLifecycle(repoName, pkgName string, revisionNum int) (string, error) { + var list porchapi.PackageRevisionList + var task []porchapi.Task + + t.initPkgRevMetrics(repoName, pkgName, revisionNum) + + start := time.Now() + err := retry.RetryOnConflict(retryBackoff, func() error { + return t.client.List(t.ctx, &list, client.InNamespace(t.testOptions.namespace)) + }) + duration := time.Since(start) + + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevList, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevList, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevList, repoName, pkgName, duration, err) + } + + if err != nil { + return "", err + } + + var latestPR *porchapi.PackageRevision + for i := range list.Items { + pr := &list.Items[i] + if pr.Spec.PackageName == pkgName && + pr.Spec.RepositoryName == repoName && + pr.Spec.Lifecycle == porchapi.PackageRevisionLifecyclePublished { + if latestPR == nil || pr.Spec.Revision > latestPR.Spec.Revision { + latestPR = pr + } + } + } + + if revisionNum == 1 { + task = []porchapi.Task{ + { + Type: porchapi.TaskTypeInit, + Init: &porchapi.PackageInitTaskSpec{ + Description: fmt.Sprintf("Test package %s for Porch metrics", pkgName), + Keywords: []string{"test", "metrics"}, + Site: "https://nephio.org", + }, + }, + } + if t.enablePrometheus { + metrics.PerfTestIncrementPackageCounter() + } + } else if latestPR != nil { + task = []porchapi.Task{ + { + Type: porchapi.TaskTypeEdit, + Edit: &porchapi.PackageEditTaskSpec{ + Source: &porchapi.PackageRevisionRef{ + Name: 
latestPR.Name, + }, + }, + }, + } + } + + workspace := fmt.Sprintf("v%d", revisionNum) + pkgRev := &porchapi.PackageRevision{ + TypeMeta: metav1.TypeMeta{ + Kind: "PackageRevision", + APIVersion: porchapi.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: t.testOptions.namespace, + }, + Spec: porchapi.PackageRevisionSpec{ + PackageName: pkgName, + WorkspaceName: workspace, + RepositoryName: repoName, + Tasks: task, + }, + } + + if err = t.createPackageRevision(pkgRev, repoName, revisionNum); err != nil { + return "", err + } + + if err = t.updateOrCreatePackageRevisionResources(repoName, pkgName, pkgRev.Name, revisionNum); err != nil { + return "", err + } + + if err = t.proposeAndApprovePackage(repoName, pkgName, pkgRev.Name, revisionNum); err != nil { + return "", err + } + + return pkgRev.Name, nil +} + +func (t *PerfTestSuite) createPackageRevision(pkgRev *porchapi.PackageRevision, repoName string, revisionNum int) error { + start := time.Now() + if t.enablePrometheus { + metrics.PerfTestRecordActiveOperation(pkgRevCreate, 1) + defer metrics.PerfTestRecordActiveOperation(pkgRevCreate, -1) + } + + err := retry.RetryOnConflict(retryBackoff, func() error { + return t.client.Create(t.ctx, pkgRev) + }) + duration := time.Since(start) + + t.recordPkgRevMetric(repoName, pkgRev.Spec.PackageName, revisionNum, pkgRevCreate, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevCreate, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevCreate, repoName, pkgRev.Spec.PackageName, duration, err) + metrics.PerfTestRecordPackageRevision(pkgRevCreate, err) + } + + if err != nil { + return err + } + + return nil +} + +func (t *PerfTestSuite) updateOrCreatePackageRevisionResources(repoName, pkgName, pkgRevName string, revisionNum int) error { + var resources porchapi.PackageRevisionResources + + start := time.Now() + err := retry.RetryOnConflict(retryBackoff, 
func() error { + return t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &resources) + }) + duration := time.Since(start) + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevResourcesGet, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevResourcesGet, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevResourcesGet, repoName, pkgName, duration, err) + } + + if err != nil { + return err + } + + pkgResources := t.createPackageResources(pkgRevName) + if resources.Spec.Resources == nil { + resources.Spec.Resources = make(map[string]string) + } + for name, content := range pkgResources { + resources.Spec.Resources[name] = content + } + + start = time.Now() + err = retry.RetryOnConflict(retryBackoff, func() error { + return t.client.Update(t.ctx, &resources) + }) + duration = time.Since(start) + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevUpdate, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevUpdate, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevUpdate, repoName, pkgName, duration, err) + } + + if err != nil { + return err + } + + return nil +} + +func (t *PerfTestSuite) createPackageResources(pkgName string) map[string]string { + resources := make(map[string]string) + + testDataSize := t.testOptions.prrPaddingInMB * 1024 * 1024 + + resources["Kptfile"] = t.readResourcesFromDir(t.testOptions.kptfilePath) + resources["deployment.yaml"] = t.readResourcesFromDir(t.testOptions.packageResourcesPath) + + resources["Kptfile"] = strings.ReplaceAll(resources["Kptfile"], "CHANGE_ME", pkgName) + resources["Kptfile"] = strings.ReplaceAll(resources["Kptfile"], "REGISTRY_URL", t.testOptions.krmFnRegistryURL) + resources["deployment.yaml"] = strings.ReplaceAll(resources["deployment.yaml"], "CHANGE_ME", pkgName) + 
strings.Repeat("a", testDataSize) + + return resources +} + +func (t *PerfTestSuite) proposeAndApprovePackage(repoName, pkgName, pkgRevName string, revisionNum int) error { + var pkg porchapi.PackageRevision + + start := time.Now() + err := retry.RetryOnConflict(retryBackoff, func() error { + return t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkg) + }) + duration := time.Since(start) + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevGet, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevGet, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevGet, repoName, pkgName, duration, err) + } + + if err != nil { + return err + } + + start = time.Now() + initialLifecycle := pkg.Spec.Lifecycle + err = retry.RetryOnConflict(retryBackoff, func() error { + if err := t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkg); err != nil { + return err + } + pkg.Spec.Lifecycle = porchapi.PackageRevisionLifecycleProposed + return t.client.Update(t.ctx, &pkg) + }) + duration = time.Since(start) + + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevPropose, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevPropose, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevPropose, repoName, pkgName, duration, err) + metrics.PerfTestRecordLifecycleTransition(string(initialLifecycle), string(porchapi.PackageRevisionLifecycleProposed), repoName, pkgName, duration, err) + } + + if err != nil { + return err + } + + start = time.Now() + err = retry.RetryOnConflict(retryBackoff, func() error { + return t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkg) + }) + duration = time.Since(start) + t.recordPkgRevMetric(repoName, pkgName, revisionNum, 
pkgRevGetProposed, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevGetProposed, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevGetProposed, repoName, pkgName, duration, err) + } + + if err != nil { + return err + } + + start = time.Now() + err = retry.RetryOnConflict(retryBackoff, func() error { + if err := t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkg); err != nil { + return err + } + pkg.Spec.Lifecycle = porchapi.PackageRevisionLifecyclePublished + _, err := t.clientSet.PorchV1alpha1().PackageRevisions(t.testOptions.namespace).UpdateApproval(t.ctx, pkgRevName, &pkg, metav1.UpdateOptions{}) + return err + }) + duration = time.Since(start) + + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevPublished, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevPublished, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevPublished, repoName, pkgName, duration, err) + metrics.PerfTestRecordLifecycleTransition(string(porchapi.PackageRevisionLifecycleProposed), string(porchapi.PackageRevisionLifecyclePublished), repoName, pkgName, duration, err) + } + + return nil +} + +func (t *PerfTestSuite) deletePackageRevision(repoName, pkgName, pkgRevName string, revisionNum int) error { + var pkgRev porchapi.PackageRevision + err := retry.RetryOnConflict(retryBackoff, func() error { + return t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkgRev) + }) + if err != nil { + return err + } + + start := time.Now() + initialLifecycle := pkgRev.Spec.Lifecycle + err = retry.RetryOnConflict(retryBackoff, func() error { + if err := t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkgRev); err != nil { + return err + } + pkgRev.Spec.Lifecycle = 
porchapi.PackageRevisionLifecycleDeletionProposed + return t.client.Update(t.ctx, &pkgRev) + }) + duration := time.Since(start) + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevProposeDeletion, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevProposeDeletion, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevProposeDeletion, repoName, pkgName, duration, err) + metrics.PerfTestRecordLifecycleTransition(string(initialLifecycle), string(porchapi.PackageRevisionLifecycleDeletionProposed), repoName, pkgName, duration, err) + } + + if err != nil { + return err + } + + start = time.Now() + err = retry.RetryOnConflict(retryBackoff, func() error { + if err := t.client.Get(t.ctx, client.ObjectKey{Namespace: t.testOptions.namespace, Name: pkgRevName}, &pkgRev); err != nil { + return err + } + return t.client.Delete(t.ctx, &pkgRev) + }) + duration = time.Since(start) + t.recordPkgRevMetric(repoName, pkgName, revisionNum, pkgRevDelete, OperationMetrics{ + Operation: fmt.Sprintf("%s:%d", pkgRevDelete, revisionNum), + Duration: duration, + Error: err, + Timestamp: start, + }) + + if t.enablePrometheus { + metrics.PerfTestRecordMetric(pkgRevDelete, repoName, pkgName, duration, err) + metrics.PerfTestRecordLifecycleTransition(string(porchapi.PackageRevisionLifecycleDeletionProposed), "deleted", repoName, pkgName, duration, err) + } + + return nil +} + +func (t *PerfTestSuite) readResourcesFromDir(dir string) string { + t.T().Helper() + var content []byte + err := filepath.WalkDir(dir, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + if !d.IsDir() { + content, err = os.ReadFile(path) + if err != nil { + t.T().Fatalf("ReadFile(%q) failed: %v", path, err) + } + } + return nil + }) + if err != nil { + t.T().Fatalf("WalkDir(%s) failed: %v", dir, err) + } + return string(content) +} diff --git a/test/performance/porch_metrics_test.go 
b/test/performance/porch_metrics_test.go index 7e96c3ca0..b0d160d8c 100644 --- a/test/performance/porch_metrics_test.go +++ b/test/performance/porch_metrics_test.go @@ -1,4 +1,4 @@ -// Copyright 2024, 2026 The Nephio Authors +// Copyright 2026 The Nephio Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,540 +14,539 @@ package metrics import ( - "bytes" - "context" - "flag" "fmt" - "net/http" + "math" "os" - "os/exec" - "os/signal" "path/filepath" - "runtime" - "syscall" + "strings" + "sync" "testing" "time" porchapi "github.com/nephio-project/porch/api/porch/v1alpha1" - configapi "github.com/nephio-project/porch/api/porchconfig/v1alpha1" - "github.com/prometheus/client_golang/prometheus/promhttp" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "github.com/stretchr/testify/suite" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/config" ) -var ( - numRepos = flag.Int("repos", 1, "Number of repositories to create") - numPackages = flag.Int("packages", 5, "Number of packages per repository") -) - -func createAndSetupRepo(t *testing.T, ctx context.Context, c client.Client, namespace, repoName string) []OperationMetrics { - var metrics []OperationMetrics - start := time.Now() - - // Create Gitea repo - err := createGiteaRepo(repoName) - duration := time.Since(start).Seconds() - recordMetric("Create Gitea Repository", repoName, "", duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Create Gitea Repository", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) - - if err != nil { - t.Logf("Warning: Failed to create Gitea repository: %v", err) - return metrics - } +type PerformanceTests struct { + PerfTestSuite +} - // Create Porch repo - start = time.Now() - repo := &configapi.Repository{ - ObjectMeta: metav1.ObjectMeta{ - Name: repoName, - Namespace: namespace, - }, - Spec: 
configapi.RepositorySpec{ - Type: "git", - Git: &configapi.GitRepository{ - Repo: fmt.Sprintf("http://172.18.255.200:3000/nephio/%s", repoName), - Branch: "main", - SecretRef: configapi.SecretRef{ - Name: "gitea", - }, - CreateBranch: true, - }, - }, +func TestPerf(t *testing.T) { + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + home, err := os.UserHomeDir() + if err == nil { + kubeconfig = filepath.Join(home, ".kube", "config") + } } - err = c.Create(ctx, repo) - duration = time.Since(start).Seconds() - recordMetric("Create Porch Repository", repoName, "", duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Create Porch Repository", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) - - if err == nil { - repositoryCounter.Inc() - start = time.Now() - err = waitForPorchRepository(ctx, c, t, namespace, repoName, 60*time.Second) - duration = time.Since(start).Seconds() - recordMetric("Wait Repository Ready", repoName, "", duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Wait for Porch Repository Ready", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) + if _, err := os.Stat(kubeconfig); err == nil { + _ = os.Setenv("KUBERNETES_MASTER", "http://localhost:8080") } - return metrics + suite.Run(t, &PerformanceTests{}) } -func createAndTestPackage(t *testing.T, ctx context.Context, c client.Client, namespace, repoName, pkgName string) []OperationMetrics { - var metrics []OperationMetrics - start := time.Now() - - // Create new package - newPkg := &porchapi.PackageRevision{ - TypeMeta: metav1.TypeMeta{ - Kind: "PackageRevision", - APIVersion: porchapi.SchemeGroupVersion.String(), - }, - ObjectMeta: metav1.ObjectMeta{ - GenerateName: fmt.Sprintf("%s-", repoName), - Namespace: namespace, - }, - Spec: porchapi.PackageRevisionSpec{ - PackageName: pkgName, - WorkspaceName: "main", - RepositoryName: repoName, - Lifecycle: 
porchapi.PackageRevisionLifecycleDraft, - Tasks: []porchapi.Task{ - { - Type: porchapi.TaskTypeInit, - Init: &porchapi.PackageInitTaskSpec{ - Description: "Test package for Porch metrics", - Keywords: []string{"test", "metrics"}, - Site: "https://nephio.org", - }, - }, - }, - }, +func (t *PerformanceTests) TestPorchScalePerformance() { + if os.Getenv("LOAD_TEST") != "1" { + t.T().Skipf("LOAD_TEST != 1: Skipping performance tests in non-load test environment") } - err := c.Create(ctx, newPkg) - duration := time.Since(start).Seconds() - recordMetric("Create PackageRevision", repoName, pkgName, duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Create PackageRevision", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) - - if err == nil { - packageCounter.Inc() + // We never use error calculation in scale performance test + errorCalculator := func(err error, errCount, numRevs int) bool { + return false } - // Wait for package to initialize - time.Sleep(5 * time.Second) - debugPackageStatus(t, c, ctx, namespace, newPkg.Name) - - // First get the package - var pkg porchapi.PackageRevision - err = c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: newPkg.Name}, &pkg) - if err == nil { - // Start timing only the update operation - start = time.Now() - pkg.Spec.Lifecycle = porchapi.PackageRevisionLifecycleProposed - err = c.Update(ctx, &pkg) - duration = time.Since(start).Seconds() - recordMetric("Update to Proposed", repoName, pkgName, duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Update to Proposed", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) + testStartTime := time.Now() - if err == nil { - // Wait for proposed state to settle - time.Sleep(5 * time.Second) - debugPackageStatus(t, c, ctx, namespace, pkg.Name) - - // Publish the package with approval - start = time.Now() - pkg.Spec.Lifecycle = porchapi.PackageRevisionLifecyclePublished - err = 
c.SubResource("approval").Update(ctx, &pkg) - duration = time.Since(start).Seconds() - recordMetric("Update to Published", repoName, pkgName, duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Update to Published", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) - - if err == nil { - // Verify final state - time.Sleep(5 * time.Second) - debugPackageStatus(t, c, ctx, namespace, pkg.Name) - } + repoSemaphore := make(chan struct{}, t.testOptions.repoParallelism) + var repoWg sync.WaitGroup + + for i := 0; i < t.testOptions.numRepos; i++ { + select { + case <-t.ctx.Done(): + t.T().Log("Test cancelled, stopping repository creation") + goto waitDone + default: } + repoSemaphore <- struct{}{} + repoWg.Add(1) + go func(repoIndex int) { + defer repoWg.Done() + defer func() { <-repoSemaphore }() + t.processRepository(repoIndex, t.testOptions.numRevs, errorCalculator) + }(i) } +waitDone: + repoWg.Wait() + lifecycleDuration := time.Since(testStartTime) - // Delete package - err = c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: pkg.Name}, &pkg) - if err == nil { - start = time.Now() - pkg.Spec.Lifecycle = porchapi.PackageRevisionLifecycleDeletionProposed - err = c.SubResource("approval").Update(ctx, &pkg) - if err == nil { - time.Sleep(2 * time.Second) - err = c.Delete(ctx, &pkg) - } - duration = time.Since(start).Seconds() - recordMetric("Delete PackageRevision", repoName, pkgName, duration, err) - metrics = append(metrics, OperationMetrics{ - Operation: "Delete PackageRevision", - Duration: time.Duration(duration * float64(time.Second)), - Error: err, - }) + var deletionStartTime time.Time + var deletionDuration time.Duration + var deletedCount int + + if t.testOptions.enableDeletion { + t.deleteEnv(&deletionStartTime, &deletionDuration, &deletedCount) } - return metrics -} + t.printTestResults(t.testLogger) -func setupMonitoring(t *testing.T) error { - // Create prometheus.yml - promConfig := ` -global: - 
scrape_interval: 1s - evaluation_interval: 1s - -scrape_configs: - - job_name: 'porch_metrics' - static_configs: - - targets: ['host.docker.internal:2113'] - scrape_interval: 1s -` - if err := os.WriteFile("prometheus.yml", []byte(promConfig), 0644); err != nil { - return fmt.Errorf("failed to create prometheus.yml: %v", err) + if t.testOptions.enableDeletion { + t.T().Logf("Total duration for deletion operations: %v", deletionDuration) + t.resultsLogger.LogToFile("Total duration for deletion operations: %v", deletionDuration) } + t.logResults(lifecycleDuration, &deletedCount) +} - // Execute Docker commands - cmds := []struct { - name string - cmd string - args []string - }{ - {"network create", "docker", []string{"network", "create", "monitoring"}}, - {"stop prometheus", "docker", []string{"stop", "prometheus"}}, - {"remove prometheus", "docker", []string{"rm", "prometheus"}}, - {"run prometheus", "docker", []string{ - "run", "-d", - "--name", "prometheus", - "--network", "monitoring", - "--add-host", "host.docker.internal:host-gateway", - "-p", "9090:9090", - "-v", fmt.Sprintf("%s/prometheus.yml:/etc/prometheus/prometheus.yml", getCurrentDir()), - "prom/prometheus", - }}, +func (t *PerformanceTests) TestIncreasePRsPerformance() { + maxPkgRevNum := math.MaxInt + if os.Getenv("MAX_PR_TEST") != "1" { + t.T().Skipf("MAX_PR_TEST != 1: Skipping performance tests in non-load test environment") } - for _, cmd := range cmds { - if err := exec.Command(cmd.cmd, cmd.args...).Run(); err != nil { - t.Logf("Warning executing %s: %v", cmd.name, err) + // TODO: Making more complex error calculation logic + errorCalculator := func(err error, errCount, numRevs int) bool { + if err != nil { + t.T().Logf("\n--- Error Rate: %f", float64(errCount)/float64(numRevs)) + return float64(errCount)/float64(numRevs) >= t.testOptions.errorRate } + return false } - // Give Prometheus a moment to start up - time.Sleep(2 * time.Second) + testStartTime := time.Now() - return nil -} + repoIndex := 
0 -func getCurrentDir() string { - dir, err := os.Getwd() - if err != nil { - return "." - } - return dir -} + t.processRepository(repoIndex, maxPkgRevNum, errorCalculator) -func cleanup(t *testing.T) { - cmds := []struct { - cmd string - args []string - }{ - {"docker", []string{"stop", "prometheus"}}, - {"docker", []string{"rm", "prometheus"}}, - {"docker", []string{"network", "rm", "monitoring"}}, - } + lifecycleDuration := time.Since(testStartTime) - for _, cmd := range cmds { - if err := exec.Command(cmd.cmd, cmd.args...).Run(); err != nil { - t.Logf("Warning during cleanup: %v", err) - } - } - - os.Remove("prometheus.yml") -} + var deletionStartTime time.Time + var deletionDuration time.Duration + var deletedCount int -func setupGiteaSecret(t *testing.T) error { - // Get the current file's directory - _, filename, _, ok := runtime.Caller(0) - if !ok { - return fmt.Errorf("failed to get current file path") + if t.testOptions.enableDeletion { + t.deleteEnv(&deletionStartTime, &deletionDuration, &deletedCount) } - dir := filepath.Dir(filename) - // Read and apply the secret manifest - secretManifest, err := os.ReadFile(filepath.Join(dir, "gitea-secret.yaml")) - if err != nil { - return fmt.Errorf("failed to read gitea secret manifest: %v", err) + t.printTestResults(t.testLogger) + if t.testOptions.enableDeletion { + t.T().Logf("Total duration for deletion operations: %v", deletionDuration) + t.resultsLogger.LogToFile("Total duration for deletion operations: %v", deletionDuration) } + t.logResults(lifecycleDuration, &deletedCount) +} - cmd := exec.Command("kubectl", "apply", "-f", "-") - cmd.Stdin = bytes.NewReader(secretManifest) - if output, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("failed to apply gitea secret: %v\nOutput: %s", err, output) - } +func (t *PerformanceTests) deleteEnv(deletionStartTime *time.Time, deletionDuration *time.Duration, deletedCount *int) { - return nil -} + *deletionStartTime = time.Now() -func 
TestPorchScalePerformance(t *testing.T) { - // Skip if not running E2E tests - if os.Getenv("E2E") == "" { - t.Skip("Skipping performance tests in non-E2E environment") - } + t.T().Log("\n=== Starting Deletion Operations ===") + t.T().Logf("Deletion enabled: will delete all %d package revisions across %d repositories", t.testOptions.numRepos*t.testOptions.numPkgs*t.testOptions.numRevs, t.testOptions.numRepos) - // Check if docker is available - if _, err := exec.LookPath("docker"); err != nil { - t.Skip("Docker not available, skipping performance tests") - } + var prList porchapi.PackageRevisionList + if err := t.client.List(t.ctx, &prList, client.InNamespace(t.testOptions.namespace)); err != nil { + t.T().Logf("failed to list package revisions for deletion: %v", err) + } else { + t.T().Logf("found %d package revisions to delete", len(prList.Items)) - // Setup Gitea secret - if err := setupGiteaSecret(t); err != nil { - t.Fatalf("Failed to setup Gitea secret: %v", err) - } + *deletedCount = 0 + for _, pr := range prList.Items { + prefix := fmt.Sprintf("%s-test-", t.testOptions.namespace) + if !strings.HasPrefix(pr.Spec.RepositoryName, prefix) { + continue + } - // Setup monitoring - if err := setupMonitoring(t); err != nil { - t.Fatalf("Failed to setup monitoring: %v", err) - } - defer cleanup(t) + revisionNum := 1 + if strings.Contains(pr.Spec.WorkspaceName, "v") { + _, _ = fmt.Sscanf(pr.Spec.WorkspaceName, "v%d", &revisionNum) + } - // Create a channel to handle interrupt signal - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) + t.T().Logf("Deleting package revision: %s (repo: %s, pkg: %s, revision: %d)", + pr.Name, pr.Spec.RepositoryName, pr.Spec.PackageName, revisionNum) - // Start metrics server - go func() { - http.Handle("/metrics", promhttp.Handler()) - if err := http.ListenAndServe(":2113", nil); err != nil { - t.Logf("Error starting metrics server: %v", err) + if err = 
t.deletePackageRevision(pr.Spec.RepositoryName, pr.Spec.PackageName, pr.Name, revisionNum); err == nil { + proposeDel := t.metrics[pr.Spec.RepositoryName].pkgRevMetrics[pr.Spec.PackageName][revisionNum].Metrics[pkgRevProposeDeletion] + del := t.metrics[pr.Spec.RepositoryName].pkgRevMetrics[pr.Spec.PackageName][revisionNum].Metrics[pkgRevDelete] + t.resultsLogger.LogDeleted(pr.Name, proposeDel.Duration+del.Duration) + *deletedCount++ + } else { + t.T().Errorf("failed to delete package revision: %s (repo: %s, pkg: %s, revision: %d)", pr.Name, pr.Spec.RepositoryName, pr.Spec.PackageName, revisionNum) + } } - }() + *deletionDuration = time.Since(*deletionStartTime) - // Setup logger - logger, err := NewTestLogger(t) - if err != nil { - t.Fatalf("Failed to create logger: %v", err) + t.T().Logf("Completed deletion of %d package revisions", deletedCount) } - defer logger.Close() +} - flag.Parse() +func (t *PerformanceTests) printTestResults(logger *TestLogger) { + header := "\n=== Consolidated Performance Test Results ===" + t.T().Log(header) + logger.LogResult("%s", header) + + subheader := "Operation Min Max Avg Total" + t.T().Log(subheader) + logger.LogResult("%s", subheader) - t.Logf("\nRunning test with %d repositories and %d packages per repository", *numRepos, *numPackages) + divider := "------------------------------------------------------------------------------------" + t.T().Log(divider) + logger.LogResult("%s", divider) - // Setup clients - cfg, err := config.GetConfig() - if err != nil { - t.Fatalf("Failed to get config: %v", err) + operationStats := make(map[string]*Stats) + + operationHeadings := map[string]string{ + giteaRepoCreate: "Create Gitea Repository ", + porchRepoCreate: "Create Porch Repository ", + repoWait: "Repository Ready Wait", + pkgRevList: "Package Revision List", + pkgRevCreate: "Package Revision Create", + pkgRevResourcesGet: "Package Revision Get Resources", + pkgRevUpdate: "Package Revision Update", + pkgRevGet: "Package Revision 
Get", + pkgRevPropose: "Package Revision Propose", + pkgRevGetProposed: "Package Revision Get (Proposed)", + pkgRevPublished: "Package Revision Approve/Publish", + pkgRevProposeDeletion: "Package Revision Propose Deletion", + pkgRevDelete: "Package Revision Delete", } - c, err := client.New(cfg, client.Options{Scheme: scheme}) - if err != nil { - t.Fatalf("Failed to create client: %v", err) + repoOperations := []string{ + giteaRepoCreate, + porchRepoCreate, + repoWait, } - ctx := context.Background() - namespace := "porch-demo" - var allMetrics []TestMetrics - - // Test multiple repositories - for i := 0; i < *numRepos; i++ { - repoName := fmt.Sprintf("porch-metrics-test-%d", i) - t.Logf("\n=== Testing Repository %d: %s ===", i+1, repoName) - - // Cleanup any existing resources first - _ = deleteGiteaRepo(repoName) - _ = c.Delete(ctx, &configapi.Repository{ - ObjectMeta: metav1.ObjectMeta{ - Name: repoName, - Namespace: namespace, - }, - }) - time.Sleep(5 * time.Second) // Wait for cleanup - - repoMetrics := createAndSetupRepo(t, ctx, c, namespace, repoName) - for _, m := range repoMetrics { - recordMetric(m.Operation, repoName, "", m.Duration.Seconds(), m.Error) - } - printIterationResults(t, logger, i*(*numPackages), repoMetrics) + pkgRevOperations := []string{ + pkgRevList, + pkgRevCreate, + pkgRevResourcesGet, + pkgRevUpdate, + pkgRevGet, + pkgRevPropose, + pkgRevGetProposed, + pkgRevPublished, + pkgRevProposeDeletion, + pkgRevDelete, + } - // Test multiple packages per repository - for j := 0; j < *numPackages; j++ { - pkgName := fmt.Sprintf("test-package-%d", j) - t.Logf("\n--- Testing Package %d: %s ---", j+1, pkgName) + allOperations := append(repoOperations, pkgRevOperations...) 
- pkgMetrics := createAndTestPackage(t, ctx, c, namespace, repoName, pkgName) - for _, m := range pkgMetrics { - recordMetric(m.Operation, repoName, pkgName, m.Duration.Seconds(), m.Error) - } - printIterationResults(t, logger, (i*(*numPackages))+j+1, pkgMetrics) + for _, op := range allOperations { + operationStats[op] = &Stats{} + } - allMetrics = append(allMetrics, TestMetrics{ - RepoName: repoName, - PkgName: pkgName, - Metrics: append(repoMetrics, pkgMetrics...), - }) + t.metricsMutex.RLock() + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + repoMetrics, exists := t.metrics[repoName] + if !exists { + continue } - // Cleanup repository - start := time.Now() - err := c.Delete(ctx, &configapi.Repository{ - ObjectMeta: metav1.ObjectMeta{ - Name: repoName, - Namespace: namespace, - }, - }) - cleanupMetrics := []OperationMetrics{{ - Operation: "Delete Repository", - Duration: time.Since(start), - Error: err, - }} - printIterationResults(t, logger, (i+1)*(*numPackages), cleanupMetrics) - } - - // Print consolidated results - printTestResults(t, logger, allMetrics) + for key, repoOp := range repoMetrics.repoOps { + if repoOp.Error != nil { + continue + } + if stats, ok := operationStats[key]; ok { + if stats.Count == 0 || repoOp.Duration < stats.Min { + stats.Min = repoOp.Duration + } + if repoOp.Duration > stats.Max { + stats.Max = repoOp.Duration + } + stats.Total += repoOp.Duration + stats.Count++ + } + } - // After all tests complete, print message and wait for interrupt - t.Log("\nTests completed. Prometheus server is running at http://localhost:9090") - t.Log("Press Ctrl+C to stop and cleanup...") + for j := 0; j < t.testOptions.numPkgs; j++ { + pkgName := fmt.Sprintf("network-function-%d", j) + pkgRevisions, exists := repoMetrics.pkgRevMetrics[pkgName] + if !exists { + continue + } - // Wait for interrupt signal - <-sigChan - t.Log("\nReceived interrupt signal. 
Cleaning up...") -} + for k := 1; k <= t.testOptions.numRevs; k++ { + pkgRevMetric, exists := pkgRevisions[k] + if !exists { + continue + } -func printIterationResults(t *testing.T, logger *TestLogger, iteration int, metrics []OperationMetrics) { - // Console output - t.Logf("\n=== Iteration %d Results ===", iteration) - t.Log("Operation Duration Status") - t.Log("--------------------------------------------------") - - // File output - logger.LogResult("\n=== Iteration %d Results ===", iteration) - logger.LogResult("Operation Duration Status") - logger.LogResult("--------------------------------------------------") - - for _, m := range metrics { - status := "Success" - if m.Error != nil { - status = "Failed: " + m.Error.Error() + for opKey, opMetric := range pkgRevMetric.Metrics { + if opMetric.Error != nil { + continue + } + if stats, ok := operationStats[opKey]; ok { + if stats.Count == 0 || opMetric.Duration < stats.Min { + stats.Min = opMetric.Duration + } + if opMetric.Duration > stats.Max { + stats.Max = opMetric.Duration + } + stats.Total += opMetric.Duration + stats.Count++ + } + } + } } - result := fmt.Sprintf("%-25s %-10v %s", - m.Operation, - m.Duration.Round(time.Millisecond), - status) - - t.Log(result) - logger.LogResult("%s", result) } -} + t.metricsMutex.RUnlock() + + t.metricsMutex.RLock() + for i := 0; i < t.testOptions.numRepos; i++ { + for _, opKey := range repoOperations { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + repoMetrics, exists := t.metrics[repoName] + if !exists { + continue + } -func printTestResults(t *testing.T, logger *TestLogger, allMetrics []TestMetrics) { - header := "\n=== Consolidated Performance Test Results ===" - t.Log(header) - logger.LogResult("%s", header) + repoOp, exists := repoMetrics.repoOps[opKey] + if !exists || repoOp.Error != nil { + continue + } - subheader := "Operation Min Max Avg Total" - t.Log(subheader) - logger.LogResult("%s", subheader) + heading := operationHeadings[opKey] + 
if heading == "" { + heading = opKey + } + headingWithNum := fmt.Sprintf("%s R%d", heading, i) + result := fmt.Sprintf("%-37s %-11v %-11v %-11v %-11v", + headingWithNum, + repoOp.Duration.Round(time.Millisecond), + repoOp.Duration.Round(time.Millisecond), + repoOp.Duration.Round(time.Millisecond), + repoOp.Duration.Round(time.Millisecond)) + t.T().Log(result) + logger.LogResult("%s", result) + } + } + t.metricsMutex.RUnlock() - divider := "------------------------------------------------------------------------" - t.Log(divider) - logger.LogResult("%s", divider) + for _, opKey := range pkgRevOperations { + stats := operationStats[opKey] + if stats.Count == 0 { + continue + } - stats := make(map[string]Stats) + for k := 1; k <= t.testOptions.numRevs; k++ { + revStats := &Stats{} + revCount := 0 - for _, m := range allMetrics { - for _, metric := range m.Metrics { - if metric.Error != nil { - continue - } - s := stats[metric.Operation] - if s.Count == 0 || metric.Duration < s.Min { - s.Min = metric.Duration + t.metricsMutex.RLock() + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + repoMetrics, exists := t.metrics[repoName] + if !exists { + continue + } + + for j := 0; j < t.testOptions.numPkgs; j++ { + pkgName := fmt.Sprintf("network-function-%d", j) + pkgRevisions, exists := repoMetrics.pkgRevMetrics[pkgName] + if !exists { + continue + } + + pkgRevMetric, exists := pkgRevisions[k] + if !exists { + continue + } + + opMetric, exists := pkgRevMetric.Metrics[opKey] + if !exists || opMetric.Error != nil { + continue + } + + if revCount == 0 || opMetric.Duration < revStats.Min { + revStats.Min = opMetric.Duration + } + if opMetric.Duration > revStats.Max { + revStats.Max = opMetric.Duration + } + revStats.Total += opMetric.Duration + revCount++ + } } - if metric.Duration > s.Max { - s.Max = metric.Duration + t.metricsMutex.RUnlock() + + if revCount > 0 { + avg := revStats.Total / time.Duration(revCount) 
+ heading := operationHeadings[opKey] + if heading == "" { + heading = opKey + } + headingWithRev := fmt.Sprintf("%s v%d", heading, k) + result := fmt.Sprintf("%-37s %-11v %-11v %-11v %-11v", + headingWithRev, + revStats.Min.Round(time.Millisecond), + revStats.Max.Round(time.Millisecond), + avg.Round(time.Millisecond), + revStats.Total.Round(time.Millisecond)) + t.T().Log(result) + logger.LogResult("%s", result) } - s.Total += metric.Duration - s.Count++ - stats[metric.Operation] = s } } - for op, stat := range stats { - avg := stat.Total / time.Duration(stat.Count) - result := fmt.Sprintf("%-25s %-11v %-11v %-11v %-11v", - op, - stat.Min.Round(time.Millisecond), - stat.Max.Round(time.Millisecond), - avg.Round(time.Millisecond), - stat.Total.Round(time.Millisecond)) - - t.Log(result) - logger.LogResult("%s", result) - } - - // Print errors if any hasErrors := false - for _, m := range allMetrics { - for _, metric := range m.Metrics { - if metric.Error != nil { + for i := 0; i < t.testOptions.numRepos; i++ { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, i) + testMetric, exists := t.metrics[repoName] + if !exists { + continue + } + + for _, opMetric := range testMetric.repoOps { + if opMetric.Error != nil { if !hasErrors { errHeader := "\n=== Errors Encountered ===" - t.Log(errHeader) + t.T().Log(errHeader) logger.LogResult("%s", errHeader) hasErrors = true } - errMsg := fmt.Sprintf("Repository: %s, Package: %s, Operation: %s, Error: %v", - m.RepoName, m.PkgName, metric.Operation, metric.Error) - t.Log(errMsg) + errMsg := fmt.Sprintf("Repository: %s, Operation: %s, Error: %v", + repoName, opMetric.Operation, opMetric.Error) + t.T().Log(errMsg) logger.LogResult("%s", errMsg) } } + + for j := 0; j < t.testOptions.numPkgs; j++ { + pkgName := fmt.Sprintf("network-function-%d", j) + pkgRevisions, exists := testMetric.pkgRevMetrics[pkgName] + if !exists { + continue + } + + for k := 1; k <= t.testOptions.numRevs; k++ { + pkgRevMetric, exists := 
pkgRevisions[k] + if !exists { + continue + } + + for _, opMetric := range pkgRevMetric.Metrics { + if opMetric.Error != nil { + if !hasErrors { + errHeader := "\n=== Errors Encountered ===" + t.T().Log(errHeader) + logger.LogResult("%s", errHeader) + hasErrors = true + } + errMsg := fmt.Sprintf("Repository: %s, Package: %s, Revision: %d, Operation: %s, Error: %v", + repoName, pkgRevMetric.pkgName, k, opMetric.Operation, opMetric.Error) + t.T().Log(errMsg) + logger.LogResult("%s", errMsg) + } + } + } + } } } -func TestMain(m *testing.M) { - // Try to load kube config from standard locations - kubeconfig := os.Getenv("KUBECONFIG") - if kubeconfig == "" { - home, err := os.UserHomeDir() - if err == nil { - kubeconfig = filepath.Join(home, ".kube", "config") +func (t *PerformanceTests) logResults(lifecycleDuration time.Duration, deletedCount *int) { + if err := t.testLogger.Sync(); err != nil { + t.T().Logf("Warning: Failed to sync test logger: %v", err) + } + + t.T().Logf("Total lifecycle duration for all operations: %v", lifecycleDuration) + t.resultsLogger.LogToFile("Total lifecycle duration for all operations: %v", lifecycleDuration) + + t.T().Log("\nGenerating CSV results...") + if err := t.generateCSVResults(); err != nil { + t.T().Logf("Warning: Failed to generate CSV results: %v", err) + } else { + t.T().Logf("- CSV results saved to: %s", t.csvOptions.lifecycleCSV) + } + + if err := t.generateDetailedOperationsCSV(); err != nil { + t.T().Logf("Warning: Failed to generate detailed operations CSV: %v", err) + } else { + t.T().Logf("- Detailed operations CSV saved to: %s", t.csvOptions.operationsCSV) + } + + if t.testOptions.enableDeletion && *deletedCount > 0 { + if err := t.generateDeletionOperationsCSV(); err != nil { + t.T().Logf("Warning: Failed to generate deletion operations CSV: %v", err) + } else { + t.T().Logf("- Deletion operations CSV saved to: %s", t.csvOptions.deletionCSV) } } - if _, err := os.Stat(kubeconfig); err == nil { - 
os.Setenv("KUBERNETES_MASTER", "http://localhost:8080") + t.T().Logf("- Raw results saved to: %s", t.logOptions.resultsFile) + t.T().Logf("- Detailed log saved to: %s", t.logOptions.fullLogFile) + + if err := t.resultsLogger.Sync(); err != nil { + t.T().Logf("Warning: Failed to sync results logger: %v", err) } + if err := t.testLogger.Sync(); err != nil { + t.T().Logf("Warning: Failed to sync test logger: %v", err) + } + + t.T().Log("\nTests completed!") +} + +func (t *PerformanceTests) processRepository(repoIndex, numRevs int, errorCalculator func(err error, errCount, numRevs int) bool) { + repoName := fmt.Sprintf("%s-test-%d", t.testOptions.namespace, repoIndex) + t.T().Logf("\n=== Creating Repository %d: %s ===", repoIndex+1, repoName) + t.createAndSetupRepo(repoName) - os.Exit(m.Run()) + t.metricsMutex.RLock() + for _, op := range t.metrics[repoName].repoOps { + t.resultsLogger.LogToFile("%s: %s - %v (took %.3fs)", repoName, op.Operation, op.Error, op.Duration.Seconds()) + } + t.metricsMutex.RUnlock() + + processPackage := func(pkgIndex int) { + errCount := 0 + pkgName := fmt.Sprintf("network-function-%d", pkgIndex) + t.T().Logf("\n--- Creating Package %s:%d ---", repoName, pkgIndex+1) + + t.metricsMutex.Lock() + t.metrics[repoName].pkgRevMetrics[pkgName] = make(map[int]PackageRevisionMetrics) + t.metricsMutex.Unlock() + + for k := 1; k <= numRevs; k++ { + select { + case <-t.ctx.Done(): + t.T().Logf("Test cancelled, stopping revision creation for package %s", pkgName) + return + default: + } + t.T().Logf("Creating revision %d/%d for package %s", k, t.testOptions.numRevs, pkgName) + if pkgRevName, err := t.doLifecycle(repoName, pkgName, k); err == nil { + t.metricsMutex.RLock() + for _, op := range t.metrics[repoName].pkgRevMetrics[pkgName][k].Metrics { + if op.Operation == fmt.Sprintf("%s:%d", pkgRevPublished, k) { + t.resultsLogger.LogApproved(repoName, pkgName, k, pkgRevName, op.Duration) + } else { + t.resultsLogger.LogToFile("%s:%s:%d - %s (took %.3fs)", 
repoName, pkgName, k, op.Operation, op.Duration.Seconds()) + } + } + t.metricsMutex.RUnlock() + } else { + t.T().Logf("An error occured during the creation/update of the package revision %s: %s", pkgRevName, err) + errCount++ + if errorCalculator(err, errCount, k) { + break + } + } + } + } + + pkgSemaphore := make(chan struct{}, t.testOptions.packageParallelism) + var pkgWg sync.WaitGroup + + for j := 0; j < t.testOptions.numPkgs; j++ { + pkgSemaphore <- struct{}{} + pkgWg.Add(1) + go func(pkgIndex int) { + defer pkgWg.Done() + defer func() { <-pkgSemaphore }() + processPackage(pkgIndex) + }(j) + } + pkgWg.Wait() } diff --git a/test/performance/prometheus_metrics.go b/test/performance/prometheus_metrics.go deleted file mode 100644 index 589c7162f..000000000 --- a/test/performance/prometheus_metrics.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2024, 2026 The Nephio Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var ( - // Operation duration metrics - operationDuration = promauto.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "porch_operation_duration_seconds", - Help: "Duration of Porch operations in seconds", - Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30, 60}, - }, - []string{"operation", "repository", "package", "status"}, - ) - - // Operation counter metrics - operationCounter = promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "porch_operations_total", - Help: "Total number of Porch operations", - }, - []string{"operation", "repository", "package", "status"}, - ) - - // Repository metrics - repositoryCounter = promauto.NewCounter( - prometheus.CounterOpts{ - Name: "porch_repositories_created_total", - Help: "Total number of repositories created", - }, - ) - - // Package metrics - packageCounter = promauto.NewCounter( - prometheus.CounterOpts{ - Name: "porch_packages_created_total", - Help: "Total number of packages created", - }, - ) -) - -// recordMetric records both duration and count for an operation -func recordMetric(operation, repoName, pkgName string, duration float64, err error) { - status := "success" - if err != nil { - status = "error" - } - - operationDuration.WithLabelValues(operation, repoName, pkgName, status).Observe(duration) - operationCounter.WithLabelValues(operation, repoName, pkgName, status).Inc() -} diff --git a/test/performance/promql_queries.txt b/test/performance/promql_queries.txt deleted file mode 100644 index 26c95e033..000000000 --- a/test/performance/promql_queries.txt +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2024 The Nephio Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -Here are the PromQL queries to get operation timings for iterations and specific package operations: - -• Time Taken for Each Operation per Iteration -# Basic operation duration -porch_operation_duration_seconds_sum{operation="Create PackageRevision"} - -# Detailed breakdown by operation -sum by (operation) (porch_operation_duration_seconds_sum) - -# Operation duration with package and repository context -sum by (operation, package, repository) (porch_operation_duration_seconds_sum) - -# Latest operation durations -sort_desc(porch_operation_duration_seconds_sum) - - -• Specific Package Revision Operations -# Time taken for specific package revision creation -porch_operation_duration_seconds_sum{operation="Create PackageRevision", package="test-package-0"} - -# Time for package to move to proposed state -porch_operation_duration_seconds_sum{operation="Update to Proposed", package="test-package-0"} - -# Time for package to move to published state -porch_operation_duration_seconds_sum{operation="Update to Published", package="test-package-0"} - -# Time taken for package deletion -porch_operation_duration_seconds_sum{operation="Delete PackageRevision", package="test-package-0"} - - -• Comparative Analysis -# Compare durations across different operations for same package -sum by (operation) ( - porch_operation_duration_seconds_sum{package="test-package-0"} -) - -# Average duration per operation type -rate(porch_operation_duration_seconds_sum[5m]) / rate(porch_operation_duration_seconds_count[5m]) - -# Operation duration distribution 
-histogram_quantile(0.95, - sum by (le, operation) ( - rate(porch_operation_duration_seconds_bucket{package="test-package-0"}[5m]) - ) -) - - -• Time Series Analysis -# Operation duration over time -rate(porch_operation_duration_seconds_sum{operation="Create PackageRevision"}[5m]) - -# Compare operation times across iterations -sum by (operation) ( - increase(porch_operation_duration_seconds_sum[1h]) -) - - -• Success/Failure Analysis -# Duration of successful operations -porch_operation_duration_seconds_sum{status="success", operation="Create PackageRevision"} - -# Duration of failed operations -porch_operation_duration_seconds_sum{status="error", operation="Create PackageRevision"} - - -Example Usage: - - -• For a specific package operation: -# Get exact duration for creating package "test-package-0" -porch_operation_duration_seconds_sum{ - operation="Create PackageRevision", - package="test-package-0", - repository="porch-metrics-test-0" -} - -# Get full lifecycle timing for package "test-package-0" -sum by (operation) ( - porch_operation_duration_seconds_sum{ - package="test-package-0", - repository="porch-metrics-test-0" - } -) - - -2. 
For iteration analysis: -# Get timing for all operations in latest iteration -sum by (operation, package) ( - porch_operation_duration_seconds_sum{ - repository="porch-metrics-test-0" - } -) - -# Compare operation durations across iterations -rate(porch_operation_duration_seconds_sum[5m]) - / -rate(porch_operation_duration_seconds_count[5m]) - - -• For specific operation analysis: -# Detailed timing for package state transitions -sum by (operation) ( - porch_operation_duration_seconds_sum{ - operation=~"Update to.*", - package="test-package-0" - } -) - - - diff --git a/test/performance/resources/Kptfile b/test/performance/resources/Kptfile new file mode 100644 index 000000000..0464fddca --- /dev/null +++ b/test/performance/resources/Kptfile @@ -0,0 +1,13 @@ +apiVersion: kpt.dev/v1 +kind: Kptfile +metadata: + name: CHANGE_ME + annotations: + config.kubernetes.io/local-config: "true" +info: + description: network function CHANGE_ME blueprint +pipeline: + mutators: + - image: REGISTRY_URL/set-namespace:v0.4.1 + configMap: + namespace: CHANGE_ME \ No newline at end of file diff --git a/test/performance/resources/deployment.yaml b/test/performance/resources/deployment.yaml new file mode 100644 index 000000000..66c3fe5fa --- /dev/null +++ b/test/performance/resources/deployment.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: CHANGE_ME +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: CHANGE_ME + template: + metadata: + labels: + app.kubernetes.io/name: CHANGE_ME + spec: + containers: + - name: nginx + image: nginx:latest \ No newline at end of file diff --git a/test/performance/types.go b/test/performance/types.go index b5a2d3558..23305620b 100644 --- a/test/performance/types.go +++ b/test/performance/types.go @@ -1,4 +1,4 @@ -// Copyright 2024, 2026 The Nephio Authors +// Copyright 2026 The Nephio Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in 
compliance with the License. @@ -11,27 +11,47 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + package metrics import ( "time" ) -// OperationMetrics holds metrics for a single operation +const ( + giteaRepoCreate = "GITEA-REPO-CREATE" + porchRepoCreate = "PORCH-REPO-CREATE" + repoWait = "REPO-WAIT" + pkgRevList = "LIST" + pkgRevGet = "GET" + pkgRevGetProposed = "GET-PROPOSED" + pkgRevResourcesGet = "GET-RESOURCES" + pkgRevCreate = "CREATE" + pkgRevUpdate = "UPDATE" + pkgRevPropose = "PROPOSE" + pkgRevPublished = "APPROVE" + pkgRevProposeDeletion = "PROPOSE-DELETION" + pkgRevDelete = "DELETE" +) + type OperationMetrics struct { Operation string Duration time.Duration Error error + Timestamp time.Time // When the operation started } -// TestMetrics holds metrics for a test iteration type TestMetrics struct { - RepoName string - PkgName string - Metrics []OperationMetrics + RepoName string + repoOps map[string]OperationMetrics + pkgRevMetrics map[string]map[int]PackageRevisionMetrics } -// Stats holds statistics for operations +type PackageRevisionMetrics struct { + pkgName string + Revision int + Metrics map[string]OperationMetrics +} type Stats struct { Min time.Duration Max time.Duration