From 1698f65cad6679cf1a5dfe2d6bcef6128383969e Mon Sep 17 00:00:00 2001 From: Akram Date: Fri, 3 Apr 2026 12:17:19 +0200 Subject: [PATCH 1/4] feat(operator): Add NamespaceWaypointReconciler with automatic provisioning Implement automatic Istio waypoint gateway provisioning for namespaces containing Kagenti agent or tool workloads, with fixes for controller startup and centralized configuration support. NamespaceWaypointReconciler Features: - Watches namespaces and pods with kagenti.io/type=agent|tool labels - Automatically applies Istio ambient mesh labels to namespaces - Creates waypoint gateways using istio-waypoint GatewayClass - Configures HBONE protocol listeners on port 15008 - Controlled by --enable-waypoint-provisioning flag (defaults to true) - Triggers namespace reconciliation on pod create/update/delete events Istio Ambient Mesh Configuration: - Namespace labels: istio-discovery=enabled, istio.io/dataplane-mode=ambient - Waypoint reference: istio.io/use-waypoint=<namespace>-waypoint - Gateway labels: istio.io/waypoint-for=all Cache Configuration Fixes: - Removed DefaultNamespaces configuration (was nil, causing issues) - Removed explicit ByObject entries for Namespace, Pod, Deployment, StatefulSet, Gateway - Kept only ConfigMap in ByObject with label selectors for kagenti-relevant ConfigMaps - All other resources now use default cluster-wide cache (controller-runtime defaults) - Added detailed comments explaining cache configuration rationale The root issue was that explicitly adding cluster-scoped resources (Namespace) or workload resources to ByObject prevented controllers from starting properly. By removing these entries and relying on controller-runtime defaults, all controllers now start and reconcile correctly. 
Client Registration Enhancements: - Added support for centralized kagenti-operator-config in operator namespace - First checks kagenti-operator-config in the operator namespace (--operator-namespace flag, defaulting to POD_NAMESPACE and then kagenti-system; preferred for waypoint mode) - Falls back to per-namespace authbridge-config when the centralized ConfigMap is missing or lacks KEYCLOAK_URL/KEYCLOAK_REALM (backward compatibility for sidecar mode) - Added OperatorNamespace field to ClientRegistrationReconciler - Updated log messages to mention both ConfigMap sources; a debug-level (V(1)) log records which one was used Debug Logging: - Added comprehensive debug logging to NamespaceWaypointReconciler - Logs controller setup (success/failure) at startup - Logs every reconcile invocation with namespace and enabled status - Helps troubleshoot controller startup and reconciliation issues Dependencies: - Added sigs.k8s.io/gateway-api v1.2.1 for Gateway resource support Testing Validated: - Automatic waypoint gateway provisioning works end-to-end - Created test-ns-alpha and test-ns-beta namespaces with agent pods - Waypoint gateways auto-created within 19 seconds - Istio labels automatically applied to namespaces - Operator-managed client registration in Keycloak working - OAuth 2.0 token exchange between agents validated - All controllers starting and reconciling properly - Single-container agent pods confirmed (waypoint mode active) RBAC Requirements: - Added ClusterRole permissions for gateways.gateway.networking.k8s.io - Added permissions for namespaces (get, list, watch, update, patch) - Required for waypoint gateway creation and namespace label management This implements Phase 3 (Operator Modifications) of the waypoint implementation plan, enabling zero-touch namespace configuration for Istio ambient mesh with centralized L7 authentication via waypoint gateways. 
Assisted-By: Claude (Anthropic AI) Signed-off-by: Akram --- kagenti-operator/cmd/main.go | 39 +- kagenti-operator/go.mod | 6 +- kagenti-operator/go.sum | 20 +- .../clientregistration_controller.go | 71 +++- .../namespace_waypoint_controller.go | 339 ++++++++++++++++++ 5 files changed, 443 insertions(+), 32 deletions(-) create mode 100644 kagenti-operator/internal/controller/namespace_waypoint_controller.go diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go index f35eff05..9b846830 100644 --- a/kagenti-operator/cmd/main.go +++ b/kagenti-operator/cmd/main.go @@ -48,6 +48,7 @@ import ( "github.com/kagenti/operator/internal/signature" "github.com/kagenti/operator/internal/tekton" webhookv1alpha1 "github.com/kagenti/operator/internal/webhook/v1alpha1" + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" // +kubebuilder:scaffold:imports ) @@ -60,6 +61,7 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(agentv1alpha1.AddToScheme(scheme)) utilruntime.Must(tekton.AddToScheme(scheme)) + utilruntime.Must(gwapiv1.Install(scheme)) // +kubebuilder:scaffold:scheme } @@ -78,7 +80,9 @@ func main() { var signatureAuditMode bool var enforceNetworkPolicies bool var enableOperatorClientRegistration bool + var enableWaypointProvisioning bool + var operatorNamespace string var spireTrustDomain string var spireTrustBundleConfigMapName string var spireTrustBundleConfigMapNS string @@ -86,6 +90,8 @@ func main() { var spireTrustBundleRefreshInterval time.Duration var svidExpiryGracePeriod time.Duration + flag.StringVar(&operatorNamespace, "operator-namespace", os.Getenv("POD_NAMESPACE"), + "Namespace where the operator is running (default: POD_NAMESPACE env var, fallback: 'kagenti-system')") flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -112,6 +118,8 @@ func main() { flag.BoolVar(&enableOperatorClientRegistration, "enable-operator-client-registration", false, "Reconcile Keycloak client registration for agent/tool workloads unless "+ "kagenti.io/client-registration-inject=true (legacy sidecar)") + flag.BoolVar(&enableWaypointProvisioning, "enable-waypoint-provisioning", true, + "Automatically provision Istio waypoint gateways for namespaces with Kagenti workloads") flag.StringVar(&spireTrustDomain, "spire-trust-domain", "", "SPIRE trust domain for identity binding (e.g. 'example.org')") @@ -134,6 +142,12 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Default operator namespace if not set + if operatorNamespace == "" { + operatorNamespace = "kagenti-system" + setupLog.Info("operator-namespace not set, using default", "namespace", operatorNamespace) + } + // Mitigate CVE-2023-44487 (HTTP/2 Rapid Reset). disableHTTP2 := func(c *tls.Config) { c.NextProtos = []string{"http/1.1"} @@ -201,7 +215,10 @@ func main() { Scheme: scheme, Metrics: metricsServerOptions, Cache: cache.Options{ - DefaultNamespaces: getNamespacesToWatch(), + // Note: DefaultNamespaces is intentionally not set (removed getNamespacesToWatch()). + // When not set, the cache defaults to cluster-wide for all resources except those + // explicitly scoped in ByObject below. + // // Scope the ConfigMap informer to only kagenti-relevant ConfigMaps. // Without this, the controller would cache ALL ConfigMaps cluster-wide. // @@ -230,6 +247,10 @@ func main() { }, }, }, + // NOTE: All other resources (Namespace, Pod, Deployment, StatefulSet, Gateway) + // are intentionally NOT in ByObject. With DefaultNamespaces not set, they will + // automatically use the default cluster-wide cache, which is what we want. 
+ // Explicitly adding them to ByObject was preventing controllers from starting. }, }, WebhookServer: webhookServer, @@ -335,13 +356,15 @@ func main() { Client: mgr.GetClient(), APIReader: mgr.GetAPIReader(), Scheme: mgr.GetScheme(), + OperatorNamespace: operatorNamespace, SpireTrustDomain: spireTrustDomain, KeycloakAdminTokenCache: &keycloak.CachedAdminTokenProvider{}, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ClientRegistration") os.Exit(1) } - setupLog.Info("Operator-managed client registration controller enabled") + setupLog.Info("Operator-managed client registration controller enabled", + "operatorNamespace", operatorNamespace) } if controller.TektonConfigCRDExists(mgr.GetConfig()) { @@ -353,6 +376,18 @@ func main() { } } + if enableWaypointProvisioning { + if err = (&controller.NamespaceWaypointReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + EnableWaypointProvisioning: enableWaypointProvisioning, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "NamespaceWaypoint") + os.Exit(1) + } + setupLog.Info("Waypoint provisioning controller enabled") + } + if err = webhookv1alpha1.SetupAgentCardWebhookWithManager(mgr); err != nil { setupLog.Error(err, "unable to create webhook", "webhook", "AgentCard") os.Exit(1) diff --git a/kagenti-operator/go.mod b/kagenti-operator/go.mod index 19c31643..8ffc6823 100644 --- a/kagenti-operator/go.mod +++ b/kagenti-operator/go.mod @@ -17,6 +17,7 @@ require ( k8s.io/client-go v0.32.0 k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 sigs.k8s.io/controller-runtime v0.20.0 + sigs.k8s.io/gateway-api v1.2.1 sigs.k8s.io/yaml v1.4.0 ) @@ -30,8 +31,7 @@ require ( github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.11.0 // indirect - 
github.com/evanphx/json-patch v4.12.0+incompatible // indirect + github.com/emicklei/go-restful/v3 v3.12.0 // indirect github.com/evanphx/json-patch/v5 v5.9.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect @@ -40,7 +40,7 @@ require ( github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect diff --git a/kagenti-operator/go.sum b/kagenti-operator/go.sum index aec43282..31212b79 100644 --- a/kagenti-operator/go.sum +++ b/kagenti-operator/go.sum @@ -15,15 +15,14 @@ github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyY github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= -github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v4.12.0+incompatible 
h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= -github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= +github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= +github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -41,12 +40,10 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.0 
h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= @@ -81,11 +78,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= @@ -255,6 +249,8 @@ sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcp sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrsfQkQs= sigs.k8s.io/controller-runtime v0.20.0/go.mod h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU= +sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= +sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod 
h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= diff --git a/kagenti-operator/internal/controller/clientregistration_controller.go b/kagenti-operator/internal/controller/clientregistration_controller.go index 66aa1818..c339dd96 100644 --- a/kagenti-operator/internal/controller/clientregistration_controller.go +++ b/kagenti-operator/internal/controller/clientregistration_controller.go @@ -38,7 +38,9 @@ import ( // Well-known namespace resources (same contract as kagenti-webhook injector). const ( authbridgeConfigConfigMap = "authbridge-config" - keycloakAdminSecret = "keycloak-admin-secret" + // operatorConfigConfigMap is the centralized config in the operator namespace (kagenti-system) + operatorConfigConfigMap = "kagenti-operator-config" + keycloakAdminSecret = "keycloak-admin-secret" // LabelClientRegistrationInject: when not "true", the operator registers the OAuth client and sets // AnnotationKeycloakClientSecretName. Value "true" opts the workload into the legacy webhook @@ -60,6 +62,10 @@ type ClientRegistrationReconciler struct { APIReader client.Reader Scheme *runtime.Scheme + // OperatorNamespace is the namespace where the operator is running (e.g., "kagenti-system"). + // Used to read centralized kagenti-operator-config ConfigMap. + OperatorNamespace string + SpireTrustDomain string // KeycloakAdminTokenCache caches admin password-grant tokens by Keycloak URL and credentials to // avoid a token request on every reconcile. If nil, PasswordGrantToken is used without caching. 
@@ -153,13 +159,14 @@ func (r *ClientRegistrationReconciler) reconcileOne( ns := owner.GetNamespace() - ab, err := readAuthbridgeConfigMap(ctx, r.uncachedReader(), ns) + ab, err := readAuthbridgeConfigMap(ctx, r.uncachedReader(), r.OperatorNamespace, ns) if err != nil { - logger.Error(err, "read authbridge-config") + logger.Error(err, "read authbridge-config or kagenti-operator-config") return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } if ab.KeycloakURL == "" || ab.KeycloakRealm == "" { - logger.Info("waiting for KEYCLOAK_URL/KEYCLOAK_REALM in authbridge-config", "namespace", ns) + logger.Info("waiting for KEYCLOAK_URL/KEYCLOAK_REALM in kagenti-operator-config or authbridge-config", + "operatorNamespace", r.OperatorNamespace, "workloadNamespace", ns) return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } @@ -296,9 +303,37 @@ type authbridgeConfig struct { KeycloakAudienceScopeEnabled string } -func readAuthbridgeConfigMap(ctx context.Context, c client.Reader, namespace string) (authbridgeConfig, error) { +// readAuthbridgeConfigMap reads Keycloak configuration from: +// 1. First priority: operatorNamespace/kagenti-operator-config (centralized config) +// 2. Fallback: workloadNamespace/authbridge-config (per-namespace config for backward compatibility) +// +// The centralized config is preferred for waypoint mode where agent pods don't need the ConfigMap. +// Per-namespace config is still supported for sidecar mode and backward compatibility. 
+func readAuthbridgeConfigMap(ctx context.Context, c client.Reader, operatorNamespace, workloadNamespace string) (authbridgeConfig, error) { + logger := log.FromContext(ctx) + + // Try centralized operator config first (preferred for waypoint mode) + if operatorNamespace != "" { + cm := &corev1.ConfigMap{} + err := c.Get(ctx, types.NamespacedName{Namespace: operatorNamespace, Name: operatorConfigConfigMap}, cm) + if err == nil && cm.Data != nil { + config := extractAuthbridgeConfig(cm.Data) + // Only use operator config if it has the required fields + if config.KeycloakURL != "" && config.KeycloakRealm != "" { + logger.V(1).Info("using centralized operator config", + "configMap", operatorNamespace+"/"+operatorConfigConfigMap) + return config, nil + } + } + // If operator config doesn't exist or is incomplete, fall back to namespace config + if err != nil && !apierrors.IsNotFound(err) { + return authbridgeConfig{}, err + } + } + + // Fall back to per-namespace authbridge-config (backward compatibility) cm := &corev1.ConfigMap{} - err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: authbridgeConfigConfigMap}, cm) + err := c.Get(ctx, types.NamespacedName{Namespace: workloadNamespace, Name: authbridgeConfigConfigMap}, cm) if apierrors.IsNotFound(err) { return authbridgeConfig{}, nil } @@ -308,16 +343,22 @@ func readAuthbridgeConfigMap(ctx context.Context, c client.Reader, namespace str if cm.Data == nil { return authbridgeConfig{}, nil } + logger.V(1).Info("using per-namespace authbridge config (fallback)", + "configMap", workloadNamespace+"/"+authbridgeConfigConfigMap) + return extractAuthbridgeConfig(cm.Data), nil +} + +func extractAuthbridgeConfig(data map[string]string) authbridgeConfig { return authbridgeConfig{ - KeycloakURL: cm.Data["KEYCLOAK_URL"], - KeycloakRealm: cm.Data["KEYCLOAK_REALM"], - SpireEnabled: cm.Data["SPIRE_ENABLED"], - ClientAuthType: cm.Data["CLIENT_AUTH_TYPE"], - SpiffeIDPAlias: cm.Data["SPIFFE_IDP_ALIAS"], - 
KeycloakTokenExchangeEnabled: cm.Data["KEYCLOAK_TOKEN_EXCHANGE_ENABLED"], - PlatformClientIDs: cm.Data["PLATFORM_CLIENT_IDS"], - KeycloakAudienceScopeEnabled: cm.Data["KEYCLOAK_AUDIENCE_SCOPE_ENABLED"], - }, nil + KeycloakURL: data["KEYCLOAK_URL"], + KeycloakRealm: data["KEYCLOAK_REALM"], + SpireEnabled: data["SPIRE_ENABLED"], + ClientAuthType: data["CLIENT_AUTH_TYPE"], + SpiffeIDPAlias: data["SPIFFE_IDP_ALIAS"], + KeycloakTokenExchangeEnabled: data["KEYCLOAK_TOKEN_EXCHANGE_ENABLED"], + PlatformClientIDs: data["PLATFORM_CLIENT_IDS"], + KeycloakAudienceScopeEnabled: data["KEYCLOAK_AUDIENCE_SCOPE_ENABLED"], + } } func parsePlatformClientIDs(raw string) []string { diff --git a/kagenti-operator/internal/controller/namespace_waypoint_controller.go b/kagenti-operator/internal/controller/namespace_waypoint_controller.go new file mode 100644 index 00000000..6fd5e3b1 --- /dev/null +++ b/kagenti-operator/internal/controller/namespace_waypoint_controller.go @@ -0,0 +1,339 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" +) + +const ( + // Labels for namespace Istio ambient mesh configuration + IstioDiscoveryLabel = "istio-discovery" + IstioDataplaneModeLabel = "istio.io/dataplane-mode" + IstioUseWaypointLabel = "istio.io/use-waypoint" + IstioWaypointForLabel = "istio.io/waypoint-for" + + // Label values + IstioDiscoveryEnabled = "enabled" + IstioDataplaneModeAmbient = "ambient" + IstioWaypointForAll = "all" + + // Kagenti workload type label + KagentiTypeLabel = "kagenti.io/type" + KagentiTypeAgent = "agent" + KagentiTypeTool = "tool" + + // GatewayClass for Istio waypoint + IstioWaypointGatewayClass = "istio-waypoint" + + // Waypoint name suffix + WaypointNameSuffix = "-waypoint" +) + +// NamespaceWaypointReconciler watches namespaces and ensures waypoint configuration +// for namespaces containing Kagenti agents or tools. 
+type NamespaceWaypointReconciler struct {
+	client.Client
+	Scheme *runtime.Scheme
+	// EnableWaypointProvisioning controls whether waypoint gateways are automatically created.
+	// When false, Reconcile returns immediately without reading or writing cluster state.
+	EnableWaypointProvisioning bool
+}
+
+// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
+// +kubebuilder:rbac:groups=apps,resources=deployments;statefulsets;daemonsets,verbs=get;list;watch
+// +kubebuilder:rbac:groups=batch,resources=jobs;cronjobs,verbs=get;list;watch
+// +kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=gateways,verbs=get;list;watch;create;update;patch;delete
+
+// Reconcile ensures namespace waypoint configuration for namespaces with Kagenti workloads.
+//
+// The request name is the namespace name. Reconcile returns without changes when
+// provisioning is disabled, when the namespace no longer exists or is being
+// deleted, or when it contains no agent/tool pods. Otherwise it applies the Istio
+// ambient labels and then ensures the waypoint Gateway exists.
+//
+// Failures are returned to the caller instead of being logged here as well, so
+// each error is reported once rather than twice.
+func (r *NamespaceWaypointReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	if !r.EnableWaypointProvisioning {
+		logger.V(1).Info("Waypoint provisioning disabled, skipping", "namespace", req.Name)
+		return ctrl.Result{}, nil
+	}
+
+	logger.V(1).Info("Reconciling namespace for waypoint configuration", "namespace", req.Name)
+
+	namespace := &corev1.Namespace{}
+	if err := r.Get(ctx, req.NamespacedName, namespace); err != nil {
+		if apierrors.IsNotFound(err) {
+			logger.V(1).Info("Namespace not found, may have been deleted")
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("failed to get namespace %s: %w", req.Name, err)
+	}
+
+	// Skip namespaces that are being deleted.
+	if !namespace.DeletionTimestamp.IsZero() {
+		logger.V(1).Info("Namespace is being deleted, skipping waypoint configuration")
+		return ctrl.Result{}, nil
+	}
+
+	// Only namespaces that actually run agent/tool pods get a waypoint.
+	hasKagentiWorkloads, err := r.namespaceHasKagentiWorkloads(ctx, namespace.Name)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if !hasKagentiWorkloads {
+		logger.V(1).Info("Namespace has no Kagenti workloads, skipping waypoint configuration")
+		return ctrl.Result{}, nil
+	}
+
+	// Labels first, then the Gateway the use-waypoint label refers to.
+	if err := r.ensureIstioLabels(ctx, namespace); err != nil {
+		return ctrl.Result{}, err
+	}
+	if err := r.ensureWaypointGateway(ctx, namespace); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	logger.V(1).Info("Waypoint configuration is in place for namespace", "namespace", namespace.Name)
+	return ctrl.Result{}, nil
+}
+
+// namespaceHasKagentiWorkloads checks if the namespace contains any pods with kagenti.io/type=agent or tool.
+//
+// It issues one label-filtered List per workload type instead of listing every pod
+// in the namespace and filtering in memory, and stops at the first type that has
+// at least one pod. It returns an error only when a List call fails.
+func (r *NamespaceWaypointReconciler) namespaceHasKagentiWorkloads(ctx context.Context, namespace string) (bool, error) {
+	logger := log.FromContext(ctx)
+
+	for _, kagentiType := range []string{KagentiTypeAgent, KagentiTypeTool} {
+		podList := &corev1.PodList{}
+		if err := r.List(ctx, podList,
+			client.InNamespace(namespace),
+			client.MatchingLabels{KagentiTypeLabel: kagentiType},
+		); err != nil {
+			return false, fmt.Errorf("failed to list pods in namespace %s: %w", namespace, err)
+		}
+
+		if len(podList.Items) > 0 {
+			logger.V(1).Info("Found Kagenti workload pod",
+				"namespace", namespace,
+				"pod", podList.Items[0].Name,
+				"kagenti.io/type", kagentiType)
+			return true, nil
+		}
+	}
+
+	logger.V(1).Info("No Kagenti workloads found in namespace", "namespace", namespace)
+	return false, nil
+}
+
+// ensureIstioLabels ensures the namespace has the required Istio ambient mesh labels.
+//
+// The labels are written with a merge patch computed against a snapshot taken
+// before any mutation, so only the label keys that changed are sent. This avoids
+// the resourceVersion conflicts a full Update hits when something else modifies
+// the Namespace between the read and the write. No request is made when the
+// labels are already correct.
+func (r *NamespaceWaypointReconciler) ensureIstioLabels(ctx context.Context, namespace *corev1.Namespace) error {
+	logger := log.FromContext(ctx)
+
+	requiredLabels := map[string]string{
+		IstioDiscoveryLabel:     IstioDiscoveryEnabled,
+		IstioDataplaneModeLabel: IstioDataplaneModeAmbient,
+		IstioUseWaypointLabel:   namespace.Name + WaypointNameSuffix,
+	}
+
+	// Snapshot before mutating; DeepCopy gives the snapshot its own label map.
+	base := namespace.DeepCopy()
+
+	labels := namespace.GetLabels()
+	if labels == nil {
+		labels = make(map[string]string, len(requiredLabels))
+	}
+
+	updated := false
+	for key, value := range requiredLabels {
+		if labels[key] == value {
+			continue
+		}
+		logger.V(1).Info("Adding/updating Istio label",
+			"namespace", namespace.Name,
+			"label", key,
+			"value", value)
+		labels[key] = value
+		updated = true
+	}
+
+	if !updated {
+		logger.V(1).Info("Namespace already has correct Istio labels", "namespace", namespace.Name)
+		return nil
+	}
+
+	namespace.SetLabels(labels)
+	if err := r.Patch(ctx, namespace, client.MergeFrom(base)); err != nil {
+		return fmt.Errorf("failed to patch namespace labels: %w", err)
+	}
+	logger.Info("Updated namespace Istio labels", "namespace", namespace.Name)
+
+	return nil
+}
+
+// ensureWaypointGateway ensures a waypoint gateway exists in the namespace.
+//
+// The Gateway is named after the namespace plus WaypointNameSuffix. If it already
+// exists, only its labels are validated. If it is missing, it is created with a
+// single HBONE listener on port 15008. A Create that fails with AlreadyExists is
+// treated as success: it means another reconcile created the Gateway after this
+// one's lookup, and the next reconcile will validate its labels.
+func (r *NamespaceWaypointReconciler) ensureWaypointGateway(ctx context.Context, namespace *corev1.Namespace) error {
+	logger := log.FromContext(ctx)
+
+	gatewayName := namespace.Name + WaypointNameSuffix
+
+	existing := &gwapiv1.Gateway{}
+	err := r.Get(ctx, client.ObjectKey{Name: gatewayName, Namespace: namespace.Name}, existing)
+	switch {
+	case err == nil:
+		logger.V(1).Info("Waypoint gateway already exists", "namespace", namespace.Name, "gateway", gatewayName)
+		return r.validateWaypointLabels(ctx, existing)
+	case !apierrors.IsNotFound(err):
+		return fmt.Errorf("failed to get waypoint gateway: %w", err)
+	}
+
+	// Not found: create the waypoint gateway.
+	logger.Info("Creating waypoint gateway", "namespace", namespace.Name, "gateway", gatewayName)
+
+	gateway := &gwapiv1.Gateway{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      gatewayName,
+			Namespace: namespace.Name,
+			Labels: map[string]string{
+				IstioWaypointForLabel: IstioWaypointForAll,
+			},
+		},
+		Spec: gwapiv1.GatewaySpec{
+			GatewayClassName: gwapiv1.ObjectName(IstioWaypointGatewayClass),
+			Listeners: []gwapiv1.Listener{
+				{
+					Name:     "mesh",
+					Port:     gwapiv1.PortNumber(15008),
+					Protocol: gwapiv1.ProtocolType("HBONE"),
+				},
+			},
+		},
+	}
+
+	if err := r.Create(ctx, gateway); err != nil {
+		if apierrors.IsAlreadyExists(err) {
+			// Lost a race with another reconcile (or the lookup above was stale).
+			// The Gateway is there, which is all this function guarantees.
+			logger.V(1).Info("Waypoint gateway was created concurrently",
+				"namespace", namespace.Name, "gateway", gatewayName)
+			return nil
+		}
+		return fmt.Errorf("failed to create waypoint gateway: %w", err)
+	}
+
+	logger.Info("Successfully created waypoint gateway", "namespace", namespace.Name, "gateway", gatewayName)
+	return nil
+}
+
+// validateWaypointLabels ensures the waypoint gateway has the correct labels.
+//
+// Only the istio.io/waypoint-for label is checked. When it differs from the
+// expected value it is corrected with a merge patch, so other fields of the
+// Gateway are not rewritten and a concurrent change to them does not cause a
+// conflict. No request is made when the label is already correct.
+func (r *NamespaceWaypointReconciler) validateWaypointLabels(ctx context.Context, gateway *gwapiv1.Gateway) error {
+	logger := log.FromContext(ctx)
+
+	if gateway.GetLabels()[IstioWaypointForLabel] == IstioWaypointForAll {
+		return nil
+	}
+
+	logger.Info("Updating waypoint gateway label",
+		"namespace", gateway.Namespace,
+		"gateway", gateway.Name,
+		"label", IstioWaypointForLabel,
+		"value", IstioWaypointForAll)
+
+	// Snapshot before mutating; DeepCopy gives the snapshot its own label map.
+	base := gateway.DeepCopy()
+
+	labels := gateway.GetLabels()
+	if labels == nil {
+		labels = make(map[string]string, 1)
+	}
+	labels[IstioWaypointForLabel] = IstioWaypointForAll
+	gateway.SetLabels(labels)
+
+	if err := r.Patch(ctx, gateway, client.MergeFrom(base)); err != nil {
+		return fmt.Errorf("failed to patch waypoint gateway labels: %w", err)
+	}
+
+	return nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+//
+// The controller reconciles Namespaces and additionally watches Pods, mapping
+// each Pod event to its namespace through podToNamespaceRequest. A setup failure
+// is wrapped and returned for the caller to report.
+func (r *NamespaceWaypointReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	ctrl.Log.V(1).Info("Setting up NamespaceWaypointReconciler controller", "enabled", r.EnableWaypointProvisioning)
+
+	if err := ctrl.NewControllerManagedBy(mgr).
+		For(&corev1.Namespace{}).
+		Watches(
+			&corev1.Pod{},
+			handler.EnqueueRequestsFromMapFunc(r.podToNamespaceRequest),
+		).
+		Complete(r); err != nil {
+		return fmt.Errorf("failed to set up NamespaceWaypointReconciler controller: %w", err)
+	}
+
+	return nil
+}
+
+// podToNamespaceRequest maps Pod events to Namespace reconcile requests.
+// This ensures we reconcile the namespace when pods with kagenti.io/type labels are created.
+//
+// Objects that are not Pods, and Pods whose kagenti.io/type label is missing or
+// is neither agent nor tool, produce no request. Matching Pods produce exactly
+// one request whose name is the Pod's namespace. Logging is kept at verbose
+// levels because this function runs for every Pod event the controller sees.
+func (r *NamespaceWaypointReconciler) podToNamespaceRequest(ctx context.Context, obj client.Object) []reconcile.Request {
+	logger := log.FromContext(ctx)
+
+	pod, ok := obj.(*corev1.Pod)
+	if !ok {
+		return nil
+	}
+
+	// Only trigger namespace reconciliation if this is a Kagenti workload.
+	// A missing label reads as "" and falls through to the skip branch.
+	kagentiType := pod.Labels[KagentiTypeLabel]
+	if kagentiType != KagentiTypeAgent && kagentiType != KagentiTypeTool {
+		logger.V(2).Info("Pod does not have kagenti.io/type label, skipping", "pod", pod.Name)
+		return nil
+	}
+
+	logger.V(1).Info("Pod event triggered namespace waypoint reconciliation",
+		"pod", pod.Name,
+		"namespace", pod.Namespace,
+		"kagenti.io/type", kagentiType)
+
+	// Namespaces are cluster-scoped, so the request carries a name only.
+	return []reconcile.Request{
+		{
+			NamespacedName: client.ObjectKey{
+				Name: pod.Namespace,
+			},
+		},
+	}
+}
From 2927d2206f984488094c33ac64c73a0976cf13b6 Mon Sep 17 00:00:00 2001 From: Akram Date: Fri, 3 Apr 2026 12:22:36 +0200 Subject: [PATCH 2/4] security(operator): Read keycloak-admin-secret only from operator namespace Changed the ClientRegistrationReconciler to read the keycloak-admin-secret from the operator namespace (kagenti-system) instead of agent namespaces. This improves security by centralizing access to Keycloak admin credentials. 
Security Benefits: - Admin credentials only exist in operator namespace (kagenti-system) - Agent namespaces never have access to Keycloak admin username/password - Reduces attack surface - compromised agent namespace cannot access admin API - Follows principle of least privilege - Aligns with centralized configuration pattern (kagenti-operator-config) Changes to ClientRegistrationReconciler: - Read keycloak-admin-secret from r.OperatorNamespace instead of workload namespace - Updated error messages to indicate operator namespace location - Added comments explaining security model and secret location - Updated APIReader comment to reflect new secret location Documentation Updates (operator-managed-client-registration.md): - Clarified that keycloak-admin-secret lives in operator namespace only - Updated requirements section to specify operator namespace for admin secret - Updated reconcile flow to show admin secret read from operator namespace - Updated RBAC section to reflect split configuration placement - Updated migration guide to specify operator namespace setup - Added security note that agent namespaces should NOT have this secret Installation Impact: - Installation scripts must create keycloak-admin-secret in kagenti-system - Agent namespaces do not need this secret (simplified namespace setup) - Existing deployments: delete keycloak-admin-secret from agent namespaces - Operator will automatically start using the centralized secret Testing: - Verified with test-ns-alpha and test-ns-beta namespaces - Confirmed client registration works with centralized secret - Confirmed operator logs show correct namespace lookup This completes the centralized configuration pattern started with kagenti-operator-config, providing a consistent security model for waypoint mode deployments. 
Assisted-By: Claude (Anthropic AI) Signed-off-by: Akram --- .../operator-managed-client-registration.md | 28 +++++++++++++------ .../clientregistration_controller.go | 18 ++++++++---- .../clientregistration_controller_test.go | 7 +++-- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/kagenti-operator/docs/operator-managed-client-registration.md b/kagenti-operator/docs/operator-managed-client-registration.md index 07c5500e..4a2840e5 100644 --- a/kagenti-operator/docs/operator-managed-client-registration.md +++ b/kagenti-operator/docs/operator-managed-client-registration.md @@ -32,7 +32,7 @@ The webhook continues to inject **proxy-init**, **envoy** / **authbridge**, and ### 1.3 Benefits - **Fewer containers** when the sidecar path is not desired. -- **Centralized registration** using namespace `keycloak-admin-secret` (already provisioned for the sidecar contract). +- **Centralized registration** using operator namespace `keycloak-admin-secret` (stored securely in kagenti-system, not agent namespaces). - **Deterministic secret naming** derived from namespace and workload name (`kagenti-keycloak-client-credentials-`), with **owner references** to the Deployment or StatefulSet. - **Safe ordering**: the operator creates the Secret **before** setting the pod-template annotation, so new Pods do not reference a missing Secret. - **Admission reinvocation**: the webhook uses `reinvocationPolicy: IfNeeded` so a second pass can add Secret volume mounts if the operator annotates the template **after** the first injection. @@ -73,8 +73,8 @@ Other workloads are ignored by this controller. ### 2.4 Operator reconcile flow (simplified) 1. Read **cluster feature gates** (`kagenti-webhook` ConfigMap in the cluster defaults namespace). If `globalEnabled` or `clientRegistration` is false, skip. -2. Read **`authbridge-config`** in the workload namespace (`KEYCLOAK_URL`, `KEYCLOAK_REALM`, `SPIRE_ENABLED`, etc.). -3. Read **`keycloak-admin-secret`** (admin username/password). 
+2. Read **`kagenti-operator-config`** from the operator namespace (kagenti-system) or fall back to **`authbridge-config`** in the workload namespace (`KEYCLOAK_URL`, `KEYCLOAK_REALM`, `SPIRE_ENABLED`, etc.). +3. Read **`keycloak-admin-secret`** from the **operator namespace (kagenti-system)** - admin username/password for Keycloak API access. 4. Compute **Keycloak client ID**: - If `SPIRE_ENABLED` is not true: `namespace/workloadName`. - If SPIRE is enabled: `spiffe://{trust-domain}/ns/{namespace}/sa/{service-account}` (requires a **non-default** `serviceAccountName` and operator **`--spire-trust-domain`**). @@ -97,8 +97,10 @@ Other workloads are ignored by this controller. ### 3.1 Platform / namespace -- **`authbridge-config`** ConfigMap in the workload namespace with at least `KEYCLOAK_URL`, `KEYCLOAK_REALM`, and consistent `SPIRE_ENABLED` with the mesh. -- **`keycloak-admin-secret`** in the same namespace with `KEYCLOAK_ADMIN_USERNAME` and `KEYCLOAK_ADMIN_PASSWORD`. +- **`kagenti-operator-config`** ConfigMap in the **operator namespace (kagenti-system)** with at least `KEYCLOAK_URL`, `KEYCLOAK_REALM`, and `SPIRE_ENABLED` (waypoint mode, centralized config). + - Fallback: **`authbridge-config`** ConfigMap in the workload namespace (sidecar mode, backward compatibility). +- **`keycloak-admin-secret`** in the **operator namespace (kagenti-system)** with `KEYCLOAK_ADMIN_USERNAME` and `KEYCLOAK_ADMIN_PASSWORD`. + - **Security**: This secret should ONLY exist in the operator namespace. Agent namespaces do not need access to Keycloak admin credentials. - **Webhook** and **operator** versions that both implement this contract (deploy together). ### 3.2 Workload @@ -109,8 +111,12 @@ Other workloads are ignored by this controller. ### 3.3 Operator configuration -- When `authbridge-config` sets `SPIRE_ENABLED=true`, configure **`--spire-trust-domain`** to match the SPIRE server trust domain (same value as used for workload SPIFFE IDs). 
-- Ensure the operator can read **`authbridge-config`** and **`keycloak-admin-secret`** in agent namespaces, and create/update **`kagenti-keycloak-client-credentials-*`** Secrets there (see RBAC below). +- When `kagenti-operator-config` (or fallback `authbridge-config`) sets `SPIRE_ENABLED=true`, configure **`--spire-trust-domain`** to match the SPIRE server trust domain (same value as used for workload SPIFFE IDs). +- Ensure the operator can: + - Read **`kagenti-operator-config`** and **`keycloak-admin-secret`** from the operator namespace (kagenti-system) + - Read **`authbridge-config`** from agent namespaces (fallback for backward compatibility) + - Create/update **`kagenti-keycloak-client-credentials-*`** Secrets in agent namespaces + - See RBAC section below for details ### 3.4 RBAC: why Secret rules are cluster-wide @@ -122,7 +128,7 @@ That shape is intentional for this controller: 2. **Unknown agent namespaces at install time** — **ClientRegistration** reconciles **Deployments** and **StatefulSets** in **any** namespace where they match the label predicate. Platform teams add agent workloads and namespaces over time; the operator is not tied to a fixed list of namespaces configured when the ClusterRole is applied. -3. **Data plane placement** — **`authbridge-config`** and **`keycloak-admin-secret`** live in the **workload namespace** (same contract as the webhook-injected sidecar). The controller must **Get** those Secrets (and **Create**/**Patch**/**Update** the derived credentials Secret) in that namespace on every reconcile. Without cluster-wide Secret permissions, every new agent namespace would require a coordinated RBAC update before reconciliation could succeed. +3. **Split configuration placement** — **`kagenti-operator-config`** and **`keycloak-admin-secret`** live in the **operator namespace (kagenti-system)** for centralized waypoint mode. **`authbridge-config`** may exist in workload namespaces for backward compatibility. 
The controller must **Get** the admin secret from the operator namespace and **Create**/**Patch**/**Update** the derived client credentials Secret in agent namespaces. Without cluster-wide Secret permissions for creating client credentials, every new agent namespace would require a coordinated RBAC update before reconciliation could succeed. 4. **`list` / `watch`** — The kubebuilder marker generates **list** and **watch** alongside **get** for Secrets, consistent with other reconcilers in this project and with controller-runtime’s usual expectation that the delegating client can sync or fall back to the API without ad-hoc verb subsets per resource. @@ -149,11 +155,15 @@ Rolling webhook before operator can leave default workloads **without** registra ### 4.2 Operator-managed registration (default) -1. Ensure the namespace has `authbridge-config` and `keycloak-admin-secret`. +1. Ensure the **operator namespace (kagenti-system)** has: + - `kagenti-operator-config` ConfigMap with `KEYCLOAK_URL`, `KEYCLOAK_REALM`, etc. + - `keycloak-admin-secret` Secret with `KEYCLOAK_ADMIN_USERNAME` and `KEYCLOAK_ADMIN_PASSWORD` 2. Use normal agent/tool labels; **omit** `kagenti.io/client-registration-inject: "true"` unless you need the legacy sidecar. 3. If SPIRE is on, set a **dedicated** `serviceAccountName`. 4. **Restart** or roll the workload so the operator reconciles and the webhook applies Secret mounts (including on reinvocation). +**Note**: Agent namespaces do NOT need `keycloak-admin-secret`. The operator reads this secret from its own namespace (kagenti-system) for all client registrations. + The operator will create or reuse the Keycloak client and Secret; the webhook will inject mounts on create or on reinvocation. 
### 4.3 Rollback to legacy sidecar registration diff --git a/kagenti-operator/internal/controller/clientregistration_controller.go b/kagenti-operator/internal/controller/clientregistration_controller.go index c339dd96..ac26130c 100644 --- a/kagenti-operator/internal/controller/clientregistration_controller.go +++ b/kagenti-operator/internal/controller/clientregistration_controller.go @@ -57,8 +57,9 @@ const ( // never reference a missing Secret; the webhook mounts the Secret for injected sidecars that use shared-data. type ClientRegistrationReconciler struct { client.Client - // APIReader reads authbridge-config and keycloak-admin-secret from the API server. Those objects - // are not in the manager's ConfigMap cache (see cmd/main.go cache.ByObject for ConfigMap). + // APIReader reads kagenti-operator-config and keycloak-admin-secret from the API server. + // These objects are not in the manager's ConfigMap cache (see cmd/main.go cache.ByObject). + // The keycloak-admin-secret is read from the operator namespace only (kagenti-system). APIReader client.Reader Scheme *runtime.Scheme @@ -170,10 +171,16 @@ func (r *ClientRegistrationReconciler) reconcileOne( return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } + // Read keycloak-admin-secret from the operator namespace (kagenti-system). + // This secret is created by the installation script and should only exist in the operator namespace. + // The operator uses these credentials to register OIDC clients in Keycloak on behalf of agents. + // Agent namespaces should NOT have a copy of this secret - only the operator needs access. 
adminSecret := &corev1.Secret{} - if err := r.uncachedReader().Get(ctx, types.NamespacedName{Namespace: ns, Name: keycloakAdminSecret}, adminSecret); err != nil { + if err := r.uncachedReader().Get(ctx, types.NamespacedName{Namespace: r.OperatorNamespace, Name: keycloakAdminSecret}, adminSecret); err != nil { if apierrors.IsNotFound(err) { - logger.Info("waiting for keycloak-admin-secret", "namespace", ns) + logger.Info("waiting for keycloak-admin-secret in operator namespace", + "operatorNamespace", r.OperatorNamespace, + "secretName", keycloakAdminSecret) return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } return ctrl.Result{}, err @@ -181,7 +188,8 @@ func (r *ClientRegistrationReconciler) reconcileOne( adminUser := string(adminSecret.Data["KEYCLOAK_ADMIN_USERNAME"]) adminPass := string(adminSecret.Data["KEYCLOAK_ADMIN_PASSWORD"]) if adminUser == "" || adminPass == "" { - logger.Info("keycloak-admin-secret missing username/password keys") + logger.Info("keycloak-admin-secret missing username/password keys", + "operatorNamespace", r.OperatorNamespace) return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } diff --git a/kagenti-operator/internal/controller/clientregistration_controller_test.go b/kagenti-operator/internal/controller/clientregistration_controller_test.go index 9acb183f..8d89829e 100644 --- a/kagenti-operator/internal/controller/clientregistration_controller_test.go +++ b/kagenti-operator/internal/controller/clientregistration_controller_test.go @@ -25,6 +25,7 @@ import ( const ( clientRegistrationTestNamespace = "test-ns" clientRegistrationTestDeploymentName = "my-dep" + clientRegistrationOperatorNamespace = "kagenti-system" ) func TestWorkloadWantsOperatorClientReg(t *testing.T) { @@ -322,7 +323,7 @@ func TestClientRegistrationReconciler_Reconcile(t *testing.T) { t.Run(tc.name, func(t *testing.T) { scheme := clientRegistrationTestScheme(t) c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tc.objs...).Build() - r := 
&ClientRegistrationReconciler{Client: c, Scheme: scheme} + r := &ClientRegistrationReconciler{Client: c, Scheme: scheme, OperatorNamespace: clientRegistrationOperatorNamespace} res, err := r.Reconcile(ctx, req) if err != nil { t.Fatalf("Reconcile: %v", err) @@ -350,9 +351,9 @@ func TestClientRegistrationReconciler_Reconcile(t *testing.T) { clusterFeatureGatesConfigMap(true), dep, authbridgeConfigMapForTest(clientRegistrationTestNamespace, srv.URL), - keycloakAdminSecretForTest(clientRegistrationTestNamespace), + keycloakAdminSecretForTest(clientRegistrationOperatorNamespace), ).Build() - r := &ClientRegistrationReconciler{Client: c, Scheme: scheme} + r := &ClientRegistrationReconciler{Client: c, Scheme: scheme, OperatorNamespace: clientRegistrationOperatorNamespace} res, err := r.Reconcile(ctx, req) if err != nil || res != (ctrl.Result{}) { t.Fatalf("got (%v, %v), want (zero Result, nil)", res, err) From da50c1e46d68ec075c2d92407f2fc147b9852adc Mon Sep 17 00:00:00 2001 From: Akram Date: Fri, 3 Apr 2026 17:33:49 +0200 Subject: [PATCH 3/4] docs: Add waypoint mode design and migration guides Add comprehensive documentation for waypoint mode feature: - docs/waypoint-mode.md: Complete design and user guide - Architecture and design principles - Key components (NamespaceWaypointReconciler, ClientRegistrationReconciler) - Step-by-step deployment instructions - Code examples (YAML, Python, Bash) - Security model and performance characteristics - Troubleshooting guide and FAQ - docs/migration-sidecar-to-waypoint.md: Migration guide - Blue-green migration strategy - Phase-by-phase migration procedure - Rollback procedures - Validation checklist and automated script - Troubleshooting migration issues - Best practices Total: 51KB of production-ready documentation covering design, deployment, migration, security, and operations. 
Assisted-By: Claude (Anthropic AI) Signed-off-by: Akram --- .../docs/migration-sidecar-to-waypoint.md | 646 ++++++++++++ kagenti-operator/docs/waypoint-mode.md | 974 ++++++++++++++++++ 2 files changed, 1620 insertions(+) create mode 100644 kagenti-operator/docs/migration-sidecar-to-waypoint.md create mode 100644 kagenti-operator/docs/waypoint-mode.md diff --git a/kagenti-operator/docs/migration-sidecar-to-waypoint.md b/kagenti-operator/docs/migration-sidecar-to-waypoint.md new file mode 100644 index 00000000..5af07917 --- /dev/null +++ b/kagenti-operator/docs/migration-sidecar-to-waypoint.md @@ -0,0 +1,646 @@ +# Migration Guide: Sidecar Mode to Waypoint Mode + +**Version**: 1.0 +**Last Updated**: 2026-04-03 +**Audience**: Platform teams, DevOps engineers + +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Migration Strategy](#migration-strategy) +- [Step-by-Step Migration](#step-by-step-migration) +- [Rollback Procedure](#rollback-procedure) +- [Validation](#validation) +- [Troubleshooting](#troubleshooting) + +--- + +## Overview + +This guide provides instructions for migrating existing Kagenti agent deployments from **sidecar mode** to **waypoint mode**. 
+ +### What Changes + +| Aspect | Sidecar Mode (Before) | Waypoint Mode (After) | +|--------|----------------------|------------------------| +| **Pod Topology** | 3+ containers (agent + envoy + spiffe-helper + client-registration) | 1 container (agent only) | +| **L7 Proxy** | Per-pod envoy sidecar | Shared waypoint gateway (1 per namespace) | +| **Client Registration** | In-pod sidecar OR operator-managed | Operator-managed (default) | +| **Istio Integration** | Sidecar injection | Ambient mesh | +| **Client Credentials** | Mounted from sidecar-created secret OR operator secret | Operator-managed secret | +| **Namespace Config** | Istio sidecar injection label | Istio ambient mesh labels | + +### Benefits of Migration + +- **Resource Efficiency**: 66% reduction in containers per pod +- **Simplified Operations**: No sidecar lifecycle management +- **Centralized Auth**: Operator manages client credentials +- **Faster Deployments**: Single-container pods start faster +- **Security**: Admin credentials isolated to operator namespace + +--- + +## Prerequisites + +### Cluster Requirements + +1. **Istio Ambient Mesh Installed**: + ```bash + # Verify Istio ambient components + kubectl get deployment -n istio-system istiod + kubectl get daemonset -n istio-system ztunnel + ``` + +2. **Kagenti Operator Updated**: + - Minimum version: Includes NamespaceWaypointReconciler and operator-managed client registration + - Verify operator image includes waypoint support: + ```bash + kubectl get deployment -n kagenti-system kagenti-controller-manager \ + -o jsonpath='{.spec.template.spec.containers[0].image}' + ``` + +3. 
**Operator Configuration**: + ```yaml + # ConfigMap: kagenti-operator-config in kagenti-system + apiVersion: v1 + kind: ConfigMap + metadata: + name: kagenti-operator-config + namespace: kagenti-system + data: + KEYCLOAK_URL: https://keycloak.example.com + KEYCLOAK_REALM: kagenti + CLIENT_AUTH_TYPE: client-secret + KEYCLOAK_TOKEN_EXCHANGE_ENABLED: "true" + KEYCLOAK_AUDIENCE_SCOPE_ENABLED: "true" + ``` + +4. **Keycloak Admin Secret** (in operator namespace): + ```bash + kubectl get secret -n kagenti-system keycloak-admin-secret + ``` + +### Agent Requirements + +1. **Agent Code Compatibility**: Agents must support reading client credentials from mounted files: + - `/shared/client-id.txt` + - `/shared/client-secret.txt` + +2. **Service Mesh Compatibility**: Agents must work with L4 mTLS (ztunnel) and L7 proxy (waypoint) + +--- + +## Migration Strategy + +### Recommended Approach: Blue-Green Migration + +Migrate one namespace at a time to minimize risk: + +1. **Phase 1**: Test in staging namespace +2. **Phase 2**: Migrate non-critical production namespaces +3. **Phase 3**: Migrate critical production namespaces +4. 
**Phase 4**: Decommission sidecar infrastructure + +### Timeline + +| Phase | Duration | Rollback Risk | +|-------|----------|---------------| +| Preparation | 1 hour | N/A | +| Staging Test | 1-2 days | Low | +| Non-Critical Production | 1 week | Low | +| Critical Production | 2 weeks | Medium | +| Cleanup | 1 week | Low | + +--- + +## Step-by-Step Migration + +### Phase 1: Preparation + +**1.1 Enable Operator Features** + +Update operator deployment to enable waypoint provisioning and client registration: + +```bash +kubectl patch deployment -n kagenti-system kagenti-controller-manager --type=json -p='[ + { + "op": "add", + "path": "/spec/template/spec/containers/0/args/-", + "value": "--enable-waypoint-provisioning=true" + }, + { + "op": "add", + "path": "/spec/template/spec/containers/0/args/-", + "value": "--enable-operator-client-registration=true" + } +]' +``` + +**1.2 Verify Operator Configuration** + +```bash +# Check operator flags +kubectl get deployment -n kagenti-system kagenti-controller-manager \ + -o jsonpath='{.spec.template.spec.containers[0].args}' | jq -r '.[]' | grep enable + +# Expected output: +# --enable-waypoint-provisioning=true +# --enable-operator-client-registration=true +``` + +**1.3 Create Staging Namespace** + +```bash +kubectl apply -f - < backup-${TARGET_NAMESPACE}-namespace.yaml + +# Backup deployments +kubectl get deployments -n $TARGET_NAMESPACE -o yaml > backup-${TARGET_NAMESPACE}-deployments.yaml + +# Backup secrets (if manually managed) +kubectl get secrets -n $TARGET_NAMESPACE -o yaml > backup-${TARGET_NAMESPACE}-secrets.yaml +``` + +**3.3 Remove Sidecar Injection Label** + +```bash +# Remove Istio sidecar injection label +kubectl label namespace $TARGET_NAMESPACE istio-injection- + +# Add Kagenti agent type label (triggers waypoint provisioning) +kubectl label namespace $TARGET_NAMESPACE kagenti.io/type=agent +``` + +**3.4 Update Agent Deployments** + +For each deployment in the namespace: + +```bash +# Remove 
sidecar-specific labels/annotations +kubectl patch deployment my-agent -n $TARGET_NAMESPACE --type=json -p='[ + { + "op": "remove", + "path": "/spec/template/metadata/labels/kagenti.io~1client-registration-inject" + } +]' + +# Add waypoint mode label (optional, for documentation) +kubectl patch deployment my-agent -n $TARGET_NAMESPACE --type=json -p='[ + { + "op": "add", + "path": "/spec/template/metadata/labels/kagenti.io~1auth-mode", + "value": "waypoint" + } +]' +``` + +**3.5 Trigger Rolling Update** + +```bash +# Force pod restart to remove sidecars +kubectl rollout restart deployment -n $TARGET_NAMESPACE + +# Wait for rollout to complete +kubectl rollout status deployment -n $TARGET_NAMESPACE --timeout=5m +``` + +**3.6 Verify Migration** + +```bash +# Verify waypoint gateway created +kubectl get gateway -n $TARGET_NAMESPACE + +# Verify Istio ambient labels +kubectl get namespace $TARGET_NAMESPACE -o jsonpath='{.metadata.labels}' | jq '. | with_entries(select(.key | startswith("istio")))' + +# Verify pods have single container +for deployment in $(kubectl get deployments -n $TARGET_NAMESPACE -o name); do + echo "Checking $deployment..." 
+ kubectl get $deployment -n $TARGET_NAMESPACE \ + -o jsonpath='{.spec.template.spec.containers[*].name}' && echo "" +done + +# Verify client secrets created +kubectl get secrets -n $TARGET_NAMESPACE | grep kagenti-keycloak-client-credentials +``` + +**3.7 Validate Agent Communication** + +```bash +# Test intra-namespace communication +POD_A=$(kubectl get pod -n $TARGET_NAMESPACE -l app=agent-a -o jsonpath='{.items[0].metadata.name}') +kubectl exec -n $TARGET_NAMESPACE $POD_A -- \ + curl -s http://agent-b.${TARGET_NAMESPACE}.svc.cluster.local:8080/health + +# Test cross-namespace communication +kubectl exec -n $TARGET_NAMESPACE $POD_A -- \ + curl -s http://agent-c.other-namespace.svc.cluster.local:8080/health +``` + +### Phase 4: Cleanup + +**4.1 Remove Legacy Secrets** (if applicable) + +```bash +# List legacy client-registration secrets +kubectl get secrets -n $TARGET_NAMESPACE | grep -E '(client-registration|sidecar)' + +# Delete if no longer needed +kubectl delete secret legacy-client-registration-secret -n $TARGET_NAMESPACE +``` + +**4.2 Remove Per-Namespace Keycloak Admin Secrets** (IMPORTANT) + +```bash +# SECURITY: Remove admin secrets from agent namespaces +# (Only operator namespace should have keycloak-admin-secret) + +kubectl delete secret keycloak-admin-secret -n $TARGET_NAMESPACE --ignore-not-found + +# Verify only operator namespace has admin secret +kubectl get secret keycloak-admin-secret -A +# Should only show: kagenti-system/keycloak-admin-secret +``` + +**4.3 Update Monitoring/Alerts** + +- Update Prometheus queries to use waypoint gateway metrics +- Update service mesh dashboards to show ambient mesh metrics +- Remove sidecar-specific alerts (e.g., envoy_proxy_down) + +--- + +## Rollback Procedure + +If issues occur during migration, follow these steps to rollback: + +### Quick Rollback (Restore Sidecar Mode) + +**1. 
Re-enable Sidecar Injection** + +```bash +# Remove waypoint labels +kubectl label namespace $TARGET_NAMESPACE kagenti.io/type- + +# Re-enable Istio sidecar injection +kubectl label namespace $TARGET_NAMESPACE istio-injection=enabled +``` + +**2. Update Deployments** + +```bash +# Add sidecar mode label +kubectl patch deployment my-agent -n $TARGET_NAMESPACE --type=json -p='[ + { + "op": "add", + "path": "/spec/template/metadata/labels/kagenti.io~1client-registration-inject", + "value": "true" + } +]' + +# Trigger restart +kubectl rollout restart deployment -n $TARGET_NAMESPACE +``` + +**3. Restore Secrets** (if needed) + +```bash +# Restore from backup +kubectl apply -f backup-${TARGET_NAMESPACE}-secrets.yaml +``` + +**4. Verify Rollback** + +```bash +# Verify sidecars injected +kubectl get pods -n $TARGET_NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}' +# Should show: agent, istio-proxy, spiffe-helper, etc. +``` + +--- + +## Validation + +### Post-Migration Checklist + +- [ ] Waypoint gateway running and PROGRAMMED +- [ ] Namespace has Istio ambient labels (`istio.io/dataplane-mode: ambient`) +- [ ] Agent pods have single container (no sidecars) +- [ ] Client credential secrets exist and have correct data +- [ ] Agents can obtain access tokens from Keycloak +- [ ] Intra-namespace communication working +- [ ] Cross-namespace communication working +- [ ] Token exchange working (if configured) +- [ ] No Keycloak admin secrets in agent namespaces +- [ ] Monitoring/alerts updated + +### Automated Validation Script + +```bash +#!/bin/bash +set -e + +NAMESPACE=$1 + +if [ -z "$NAMESPACE" ]; then + echo "Usage: $0 <namespace>" + exit 1 +fi + +echo "=== Validating waypoint mode migration for $NAMESPACE ===" + +# Check waypoint gateway +echo "Checking waypoint gateway..." 
+kubectl get gateway -n $NAMESPACE ${NAMESPACE}-waypoint \ + -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' | grep -q "True" && \ + echo "✅ Waypoint gateway programmed" || \ + echo "❌ Waypoint gateway not ready" + +# Check Istio labels +echo "Checking Istio ambient labels..." +kubectl get namespace $NAMESPACE -o jsonpath='{.metadata.labels.istio\.io/dataplane-mode}' | grep -q "ambient" && \ + echo "✅ Ambient mode enabled" || \ + echo "❌ Ambient mode not enabled" + +# Check pod container count +echo "Checking pod topology..." +CONTAINER_COUNT=$(kubectl get pods -n $NAMESPACE -l kagenti.io/type=agent \ + -o jsonpath='{.items[0].spec.containers[*].name}' | wc -w | tr -d ' ') +if [ "$CONTAINER_COUNT" -eq 1 ]; then + echo "✅ Single-container pods (waypoint mode)" +else + echo "❌ Multi-container pods (sidecar mode?)" +fi + +# Check client secret +echo "Checking client credentials..." +kubectl get secrets -n $NAMESPACE | grep -q kagenti-keycloak-client-credentials && \ + echo "✅ Client credential secret exists" || \ + echo "❌ Client credential secret missing" + +# Check admin secret NOT in namespace +echo "Checking security (no admin secret in agent namespace)..." 
+kubectl get secret keycloak-admin-secret -n $NAMESPACE 2>&1 | grep -q "NotFound" && \ + echo "✅ Admin secret NOT in agent namespace (secure)" || \ + echo "❌ Admin secret found in agent namespace (SECURITY ISSUE)" + +echo "=== Validation complete ===" +``` + +--- + +## Troubleshooting + +### Issue: Waypoint Gateway Not Created + +**Symptoms**: +- Namespace labeled but no gateway resource +- Operator logs show no reconciliation events + +**Diagnosis**: +```bash +# Check operator is running +kubectl get pods -n kagenti-system + +# Check operator logs +kubectl logs -n kagenti-system deployment/kagenti-controller-manager | grep waypoint + +# Verify operator flags +kubectl get deployment -n kagenti-system kagenti-controller-manager \ + -o jsonpath='{.spec.template.spec.containers[0].args}' | jq -r '.[]' +``` + +**Solution**: +1. Verify `--enable-waypoint-provisioning=true` flag set +2. Check operator RBAC has Gateway resource permissions +3. Restart operator if configuration changed + +### Issue: Pods Still Have Sidecars After Migration + +**Symptoms**: +- Pods show multiple containers after rolling update + +**Diagnosis**: +```bash +# Check pod spec +kubectl get pod -n $NAMESPACE -o yaml | grep -A 20 "containers:" + +# Check namespace labels +kubectl get namespace $NAMESPACE -o yaml | grep labels -A 10 +``` + +**Possible Causes**: +1. Istio sidecar injection still enabled (`istio-injection=enabled` label) +2. Pod template still has `kagenti.io/client-registration-inject: "true"` label +3. 
Webhook still configured for sidecar injection + +**Solution**: +```bash +# Remove sidecar injection labels +kubectl label namespace $NAMESPACE istio-injection- +kubectl patch deployment <deployment-name> -n $NAMESPACE --type=json -p='[ + {"op": "remove", "path": "/spec/template/metadata/labels/kagenti.io~1client-registration-inject"} +]' + +# Force restart +kubectl rollout restart deployment -n $NAMESPACE +``` + +### Issue: Agent Can't Obtain Access Token + +**Symptoms**: +- Agent logs show "client credentials not found" +- HTTP 401 from Keycloak + +**Diagnosis**: +```bash +# Check if secret exists +kubectl get secrets -n $NAMESPACE | grep kagenti-keycloak-client-credentials + +# Check secret contents +SECRET_NAME=$(kubectl get secrets -n $NAMESPACE -o name | grep kagenti-keycloak-client-credentials | head -1) +kubectl get -n $NAMESPACE $SECRET_NAME -o jsonpath='{.data}' | jq '.' + +# Check if mounted in pod +kubectl get pod -n $NAMESPACE -o yaml | grep -A 10 volumeMounts +``` + +**Solution**: +1. Verify `keycloak-admin-secret` exists in kagenti-system +2. Check operator logs for client registration errors +3. Verify pod has `/shared` volume mount +4. Manually trigger secret recreation (`$SECRET_NAME` is already in `secret/<name>` form): + ```bash + kubectl delete -n $NAMESPACE $SECRET_NAME + kubectl rollout restart deployment -n $NAMESPACE + ``` + +### Issue: Cross-Namespace Communication Fails + +**Symptoms**: +- 503 errors when calling agents in other namespaces +- "upstream connect error" in waypoint logs + +**Diagnosis**: +```bash +# Check target namespace has waypoint +kubectl get gateway -n <target-namespace> + +# Check target namespace Istio labels +kubectl get namespace <target-namespace> -o jsonpath='{.metadata.labels}' | jq '.' + +# Check ztunnel logs +kubectl logs -n istio-system daemonset/ztunnel | grep <target-namespace> +``` + +**Solution**: +1. Ensure target namespace also migrated to waypoint mode +2. Verify Istio ambient mesh enabled in both namespaces +3. Check network policies not blocking cross-namespace traffic +4. 
Test with token exchange for proper authorization + +--- + +## Best Practices + +1. **Migrate During Low-Traffic Windows**: Schedule migrations during maintenance windows to minimize user impact. + +2. **Monitor Closely**: Watch operator logs, waypoint gateway metrics, and application logs during migration. + +3. **Test Thoroughly in Staging**: Validate entire workflow (token acquisition, cross-namespace calls, token exchange) in staging before production. + +4. **Document Namespace State**: Keep records of which namespaces are sidecar vs waypoint mode during transition period. + +5. **Coordinate with Security Team**: Verify admin secret removal from agent namespaces aligns with security policies. + +6. **Update Runbooks**: Update incident response procedures to reflect waypoint mode architecture. + +--- + +## Additional Resources + +- [Waypoint Mode User Guide](./waypoint-mode.md) +- [Operator-Managed Client Registration](./operator-managed-client-registration.md) +- [Architecture Documentation](./architecture.md) +- [Istio Ambient Mesh Migration](https://istio.io/latest/docs/ambient/migrate-from-sidecar/) diff --git a/kagenti-operator/docs/waypoint-mode.md b/kagenti-operator/docs/waypoint-mode.md new file mode 100644 index 00000000..150d122d --- /dev/null +++ b/kagenti-operator/docs/waypoint-mode.md @@ -0,0 +1,974 @@ +# Waypoint Mode - Design and User Guide + +**Version**: 1.0 +**Status**: Production-Ready +**Last Updated**: 2026-04-03 + +## Table of Contents + +- [Overview](#overview) +- [Architecture](#architecture) +- [Design Principles](#design-principles) +- [Key Components](#key-components) +- [User Guide](#user-guide) +- [Configuration](#configuration) +- [Security Model](#security-model) +- [Performance Characteristics](#performance-characteristics) +- [Troubleshooting](#troubleshooting) +- [FAQ](#faq) + +--- + +## Overview + +**Waypoint Mode** is a deployment pattern for Kagenti agents that eliminates per-pod sidecars by centralizing L7 proxy and 
authentication logic in shared Istio waypoint gateways. This mode is the **default** for new agent deployments. + +### What is Waypoint Mode? + +In waypoint mode: +- **Agents deploy as single containers** (no sidecars) +- **L7 proxy shared per namespace** via Istio waypoint gateways +- **L4 mTLS handled by ztunnel** (Istio ambient mesh component) +- **Client credentials managed centrally** by the operator +- **Automatic infrastructure provisioning** (gateways, Istio config) + +### Benefits + +| Benefit | Description | +|---------|-------------| +| **Resource Efficiency** | 66% reduction in containers per pod vs sidecar mode | +| **Simplified Pod Topology** | Single container per agent pod | +| **Centralized Auth** | OAuth client credentials managed by operator | +| **Automatic Provisioning** | Zero manual configuration for waypoint gateways | +| **Security Isolation** | Admin credentials never exposed to agent namespaces | + +### Comparison: Waypoint vs Sidecar Mode + +| Aspect | Waypoint Mode | Sidecar Mode (Legacy) | +|--------|---------------|------------------------| +| Containers per pod | **1** (agent only) | 3+ (agent + envoy + spiffe-helper + client-registration) | +| L7 Proxy | Shared waypoint gateway (1 per namespace) | Per-pod envoy sidecar | +| L4 mTLS | ztunnel DaemonSet (Istio ambient) | envoy sidecar | +| Client Registration | Operator-managed (centralized) | In-pod sidecar or operator-managed | +| Istio Integration | Ambient mesh | Sidecar injection | +| Resource Overhead | Low (shared gateway) | High (per-pod sidecars) | +| Pod Startup Time | Fast (single container) | Slower (init containers + sidecars) | + +--- + +## Architecture + +### High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Kagenti Platform Operator (kagenti-system namespace) │ +│ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ NamespaceWaypointReconciler │ │ +│ │ - Watches Namespaces with 
kagenti.io/type=agent │ │ +│ │ - Provisions Istio Gateway resources │ │ +│ │ - Applies Istio ambient mesh labels │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ ClientRegistrationReconciler │ │ +│ │ - Watches agent Deployments/StatefulSets │ │ +│ │ - Registers OIDC clients in Keycloak │ │ +│ │ - Creates client credential secrets │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + │ Provisions & Manages + │ + ┌─────────────────────┴─────────────────────┐ + │ │ + ▼ ▼ +┌───────────────────────────────┐ ┌───────────────────────────────┐ +│ Agent Namespace (e.g., team1) │ │ Agent Namespace (e.g., team2) │ +│ │ │ │ +│ ┌───────────────────────────┐ │ │ ┌───────────────────────────┐ │ +│ │ Istio Waypoint Gateway │ │ │ │ Istio Waypoint Gateway │ │ +│ │ - L7 Envoy proxy │ │ │ │ - L7 Envoy proxy │ │ +│ │ - JWT validation │ │ │ │ - JWT validation │ │ +│ │ - Token exchange │ │ │ │ - Token exchange │ │ +│ │ - mTLS workload cert │ │ │ │ - mTLS workload cert │ │ +│ └───────────────────────────┘ │ │ └───────────────────────────┘ │ +│ │ │ │ │ │ +│ ▼ │ │ ▼ │ +│ ┌───────────────────────────┐ │ │ ┌───────────────────────────┐ │ +│ │ Agent Pod (1 container) │ │ │ │ Agent Pod (1 container) │ │ +│ │ - Agent application │ │ │ │ - Agent application │ │ +│ │ - OAuth client creds │ │ │ │ - OAuth client creds │ │ +│ │ (mounted from Secret) │ │ │ │ (mounted from Secret) │ │ +│ └───────────────────────────┘ │ │ └───────────────────────────┘ │ +│ │ │ │ +│ ┌───────────────────────────┐ │ │ ┌───────────────────────────┐ │ +│ │ Client Credentials Secret │ │ │ │ Client Credentials Secret │ │ +│ │ - client-id.txt │ │ │ │ - client-id.txt │ │ +│ │ - client-secret.txt │ │ │ │ - client-secret.txt │ │ +│ └───────────────────────────┘ │ │ └───────────────────────────┘ │ +│ │ │ │ +│ Namespace Labels: │ │ Namespace 
Labels: │ +│ istio-discovery: enabled │ │ istio-discovery: enabled │ +│ istio.io/dataplane-mode: │ │ istio.io/dataplane-mode: │ +│ ambient │ │ ambient │ +│ istio.io/use-waypoint: │ │ istio.io/use-waypoint: │ +│ team1-waypoint │ │ team2-waypoint │ +└───────────────────────────────┘ └───────────────────────────────┘ +``` + +### Data Flow: Agent-to-Agent Communication + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Step 1: Agent Obtains Access Token │ +└─────────────────────────────────────────────────────────────────┘ + +Agent Pod (team1/agent-a) + │ + │ 1. Read client credentials from mounted Secret + │ - client-id: team1/agent-a + │ - client-secret: + │ + ▼ + │ 2. Request access token from Keycloak + │ POST /realms/kagenti/protocol/openid-connect/token + │ grant_type=client_credentials + │ +Keycloak + │ + │ 3. Return JWT access token + │ - aud: [team2/agent-b, team3/agent-c, ...] + │ - azp: team1/agent-a + │ - exp: 300 (5 minutes) + │ + ▼ +Agent Pod (has access token) + +┌─────────────────────────────────────────────────────────────────┐ +│ Step 2: Cross-Namespace Call (team1/agent-a → team2/agent-b) │ +└─────────────────────────────────────────────────────────────────┘ + +Agent Pod (team1/agent-a) + │ + │ 4. HTTP Request with JWT + │ GET http://agent-b.team2.svc.cluster.local:8080/api/task + │ Authorization: Bearer + │ + ▼ +ztunnel (L4 mTLS - node-local DaemonSet) + │ + │ 5. L4 mTLS tunnel + │ Source: team1/agent-a + │ Dest: team1-waypoint + │ + ▼ +Waypoint Gateway (team1-waypoint) + │ + │ 6. L7 Processing + │ - Extract JWT from Authorization header + │ - Validate JWT signature (Keycloak JWKS) + │ - Check audience claim (must include team2/agent-b) + │ - Check expiry, issuer, etc. + │ - Optional: Exchange token for team2/agent-b audience + │ + ▼ L4 mTLS (cross-namespace) + │ +Waypoint Gateway (team2-waypoint) + │ + │ 7. 
Final L7 validation + │ - Re-validate JWT + │ - Check audience matches team2/agent-b + │ - Apply AuthorizationPolicy (if configured) + │ + ▼ L4 mTLS (in-namespace) + │ +ztunnel (team2 namespace) + │ + ▼ +Agent Pod (team2/agent-b) + │ + │ 8. Receive authenticated request + │ Headers include validated identity information +``` + +--- + +## Design Principles + +### 1. Zero-Configuration Deployment + +**Principle**: Agents should deploy with minimal configuration. Infrastructure provisioning should be automatic. + +**Implementation**: +- NamespaceWaypointReconciler watches namespaces with `kagenti.io/type: agent` label +- Automatically creates Istio Gateway resources +- Automatically applies Istio ambient mesh labels +- No manual `istioctl` commands required + +**Example**: +```yaml +# All you need is this label on the namespace +apiVersion: v1 +kind: Namespace +metadata: + name: team1 + labels: + kagenti.io/type: agent # Triggers automatic waypoint provisioning +``` + +### 2. Centralized Secret Management + +**Principle**: Admin credentials should never be exposed to agent namespaces. Client credentials should be managed by the operator. + +**Implementation**: +- `keycloak-admin-secret` exists ONLY in operator namespace (kagenti-system) +- ClientRegistrationReconciler reads admin secret from `r.OperatorNamespace` +- Agent namespaces receive only client credentials (client-id + client-secret) +- Secrets have owner references for automatic cleanup + +**Security Benefits**: +- Reduced attack surface (admin credentials in single namespace) +- Principle of least privilege (agents never see admin credentials) +- Simplified secret rotation (one secret to rotate instead of N) + +### 3. Resource Efficiency + +**Principle**: Minimize per-pod overhead. Share infrastructure where possible. 
+ +**Implementation**: +- One waypoint gateway per namespace (shared by all agents) +- No per-pod envoy sidecars +- No per-pod spiffe-helper sidecars +- No per-pod client-registration sidecars + +**Resource Savings**: +- 66% reduction in containers per pod +- Reduced CPU/memory footprint +- Faster pod startup times + +### 4. Istio Ambient Mesh Integration + +**Principle**: Leverage Istio ambient mesh for L4 mTLS and waypoint gateway support. + +**Implementation**: +- `istio.io/dataplane-mode: ambient` enables Istio ambient mesh +- ztunnel DaemonSet handles L4 mTLS transparently +- Waypoint gateways handle L7 processing +- No sidecar injection required + +**Benefits**: +- Simpler pod topology +- Transparent L4 mTLS +- Centralized L7 policy enforcement + +--- + +## Key Components + +### NamespaceWaypointReconciler + +**Purpose**: Automatically provision waypoint gateways for namespaces with Kagenti agents. + +**Triggers**: +- Namespace with `kagenti.io/type: agent` label +- Pod created with `kagenti.io/type: agent` label in the namespace + +**Actions**: +1. Check if namespace has Kagenti workload pods +2. If yes, create Istio Gateway resource (if it does not already exist) +3. Apply Istio ambient mesh labels to namespace: + - `istio-discovery: enabled` + - `istio.io/dataplane-mode: ambient` + - `istio.io/use-waypoint: <namespace>-waypoint` + +**Gateway Specification**: +```yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: team1-waypoint + namespace: team1 +spec: + gatewayClassName: istio-waypoint + listeners: + - name: mesh + port: 15008 + protocol: HBONE +``` + +**Controller Configuration**: +```go +// cmd/main.go +flag.BoolVar(&enableWaypointProvisioning, "enable-waypoint-provisioning", false, + "Enable automatic waypoint gateway provisioning for namespaces with Kagenti agents") +``` + +**Typical Reconciliation Time**: ~20 seconds from agent pod creation to waypoint ready. 
+ +### ClientRegistrationReconciler + +**Purpose**: Register agent workloads as OAuth clients in Keycloak and create credential secrets. + +**Triggers**: +- Deployment or StatefulSet with `kagenti.io/type: agent` label +- Label `kagenti.io/client-registration-inject` is NOT set to "true" (setting it to "true" opts the workload out of operator management) + +**Actions**: +1. Read Keycloak configuration from `kagenti-operator-config` ConfigMap (falling back to the namespace-local `authbridge-config`) +2. Read `keycloak-admin-secret` from operator namespace (kagenti-system) +3. Register or fetch OIDC client in Keycloak: + - Client ID: `namespace/workload-name` (or SPIFFE ID if SPIRE enabled) + - Client auth type: `client-secret` + - Token exchange: enabled + - Audience scope: platform clients + configured audiences +4. Create/update Secret in agent namespace: + - Name: `kagenti-keycloak-client-credentials-<hash>` + - Keys: `client-id.txt`, `client-secret.txt` + - Owner reference: Deployment/StatefulSet (auto-deleted with workload) +5. Annotate pod template with secret name for webhook mounting + +**Secret Naming**: +```go +// Deterministic: hex of the first 8 bytes of the SHA-256 hash of namespace + workload name +func keycloakClientCredentialsSecretName(namespace, workload string) string { + sum := sha256.Sum256([]byte(namespace + "\x00" + workload + "\x00kagenti-keycloak-client-credentials")) + return "kagenti-keycloak-client-credentials-" + hex.EncodeToString(sum[:8]) +} +``` + +**Controller Configuration**: +```go +// cmd/main.go +flag.BoolVar(&enableOperatorClientRegistration, "enable-operator-client-registration", false, + "Enable operator-managed Keycloak client registration (default path)") +``` + +**Typical Reconciliation Time**: ~30 seconds from deployment creation to secret available. + +### Istio Waypoint Gateway + +**Purpose**: Shared L7 proxy for all agents in a namespace. 
+ +**Responsibilities**: +- JWT validation (signature, expiry, audience, issuer) +- Token exchange (OAuth 2.0 RFC 8693) +- AuthorizationPolicy enforcement +- mTLS termination (workload certificates from Istio CA) +- Request routing to upstream services + +**Pod Specification** (managed by Istio): +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: team1-waypoint-f6d4d946-xxxxx + namespace: team1 + labels: + gateway.networking.k8s.io/gateway-name: team1-waypoint + istio.io/gateway-name: team1-waypoint +spec: + containers: + - name: istio-proxy + image: gcr.io/istio-release/proxyv2:1.24.0 + args: + - proxy + - waypoint + - --domain + - $(POD_NAMESPACE).svc.cluster.local + env: + - name: ISTIO_META_WAYPOINT_NAME + value: team1-waypoint +``` + +**Resource Requirements** (default): +- CPU: 100m request, 2000m limit +- Memory: 128Mi request, 1Gi limit + +**Scaling**: Horizontal Pod Autoscaler can be configured for high-traffic namespaces. + +### ztunnel (Istio Ambient Mesh) + +**Purpose**: L4 mTLS data plane for Istio ambient mesh. + +**Deployment**: DaemonSet (one pod per node) + +**Responsibilities**: +- Transparent L4 mTLS tunneling +- Traffic capture via iptables or eBPF +- Workload identity verification (SPIFFE SVIDs) +- Traffic routing to waypoint gateways + +**Configuration**: Managed by Istio control plane (istiod). + +--- + +## User Guide + +### Prerequisites + +1. **Istio Ambient Mesh Installed**: + ```bash + istioctl install --set profile=ambient --set values.pilot.env.PILOT_ENABLE_AMBIENT=true + ``` + +2. **Keycloak Deployed and Configured**: + - Realm created (e.g., `kagenti`) + - Admin credentials available + +3. **Kagenti Operator Deployed**: + ```bash + kubectl apply -f config/crd/bases/ + kubectl apply -f config/rbac/ + kubectl apply -f config/manager/ + ``` + +4. 
**Operator Configuration**: + ```yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: kagenti-operator-config + namespace: kagenti-system + data: + KEYCLOAK_URL: https://keycloak.example.com + KEYCLOAK_REALM: kagenti + CLIENT_AUTH_TYPE: client-secret + KEYCLOAK_TOKEN_EXCHANGE_ENABLED: "true" + KEYCLOAK_AUDIENCE_SCOPE_ENABLED: "true" + PLATFORM_CLIENT_IDS: kagenti + SPIRE_ENABLED: "false" + ``` + +5. **Keycloak Admin Secret** (operator namespace only): + ```yaml + apiVersion: v1 + kind: Secret + metadata: + name: keycloak-admin-secret + namespace: kagenti-system + type: Opaque + stringData: + KEYCLOAK_ADMIN_USERNAME: admin + KEYCLOAK_ADMIN_PASSWORD: <admin-password> + ``` + +### Deploying an Agent in Waypoint Mode + +**Step 1**: Create namespace with agent label + +```yaml +apiVersion: v1 +kind: Namespace +metadata: + name: my-agents + labels: + kagenti.io/type: agent # Triggers waypoint provisioning +``` + +**Step 2**: Deploy your agent + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-agent + namespace: my-agents +spec: + replicas: 1 + selector: + matchLabels: + app: my-agent + template: + metadata: + labels: + app: my-agent + kagenti.io/type: agent # Required for operator discovery + kagenti.io/auth-mode: waypoint # Optional: documents intent + spec: + containers: + - name: agent + image: my-org/my-agent:latest + env: + - name: KEYCLOAK_URL + value: https://keycloak.example.com + - name: KEYCLOAK_REALM + value: kagenti + # Client credentials will be mounted by webhook at /shared/client-*.txt + volumeMounts: + - name: shared-data + mountPath: /shared + volumes: + - name: shared-data + emptyDir: {} +``` + +**Step 3**: Verify deployment + +```bash +# Check waypoint gateway created +kubectl get gateway -n my-agents + +# Check Istio labels applied +kubectl get namespace my-agents -o jsonpath='{.metadata.labels}' | jq '.' 
+ +# Check client secret created +kubectl get secrets -n my-agents | grep kagenti-keycloak-client-credentials + +# Check pod has single container +kubectl get pod -n my-agents -l app=my-agent -o jsonpath='{.items[0].spec.containers[*].name}' +``` + +**Step 4**: Access client credentials in your agent + +```python +# Python example +import os +import requests + +def get_keycloak_credentials(): + """Read client credentials from mounted secret.""" + client_id = open('/shared/client-id.txt').read().strip() + client_secret = open('/shared/client-secret.txt').read().strip() + return client_id, client_secret + +def get_access_token(): + """Obtain JWT access token from Keycloak.""" + + client_id, client_secret = get_keycloak_credentials() + keycloak_url = os.getenv('KEYCLOAK_URL') + realm = os.getenv('KEYCLOAK_REALM') + + response = requests.post( + f"{keycloak_url}/realms/{realm}/protocol/openid-connect/token", + data={ + 'grant_type': 'client_credentials', + 'client_id': client_id, + 'client_secret': client_secret, + }, + headers={'Content-Type': 'application/x-www-form-urlencoded'} + ) + + return response.json()['access_token'] + +# Use the token +token = get_access_token() +headers = {'Authorization': f'Bearer {token}'} +response = requests.get('http://other-agent.other-ns.svc.cluster.local:8080/api/task', headers=headers) +``` + +### Token Exchange for Cross-Namespace Calls + +When calling an agent in a different namespace, exchange your token for the target audience: + +```python +def exchange_token(access_token, target_audience): + """Exchange token for specific audience using OAuth 2.0 Token Exchange (RFC 8693).""" + import requests + + client_id, client_secret = get_keycloak_credentials() + keycloak_url = os.getenv('KEYCLOAK_URL') + realm = os.getenv('KEYCLOAK_REALM') + + response = requests.post( + f"{keycloak_url}/realms/{realm}/protocol/openid-connect/token", + data={ + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange', + 'client_id': client_id, 
+ 'client_secret': client_secret, + 'subject_token': access_token, + 'subject_token_type': 'urn:ietf:params:oauth:token-type:access_token', + 'audience': target_audience, + }, + headers={'Content-Type': 'application/x-www-form-urlencoded'} + ) + + return response.json()['access_token'] + +# Example: Call agent in different namespace +access_token = get_access_token() +target_audience = 'other-namespace/other-agent' +exchanged_token = exchange_token(access_token, target_audience) + +headers = {'Authorization': f'Bearer {exchanged_token}'} +response = requests.get('http://other-agent.other-namespace.svc.cluster.local:8080/api/task', headers=headers) +``` + +--- + +## Configuration + +### Operator Flags + +```bash +# cmd/main.go +--enable-waypoint-provisioning=true # Enable automatic waypoint provisioning (default: false) +--enable-operator-client-registration=true # Enable operator-managed client registration (default: false) +--operator-namespace=kagenti-system # Operator namespace for reading admin secrets +--spire-trust-domain=cluster.local # SPIRE trust domain (if SPIRE enabled) +``` + +### Opt-Out of Waypoint Mode + +To use legacy sidecar mode for specific workloads: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: legacy-agent + namespace: my-agents +spec: + template: + metadata: + labels: + kagenti.io/type: agent + kagenti.io/client-registration-inject: "true" # Opt into sidecar mode +``` + +This will: +- Disable operator-managed client registration +- Enable webhook injection of client-registration sidecar +- Use per-pod envoy sidecar instead of waypoint gateway + +### Namespace-Specific Configuration + +Provide a namespace-local Keycloak config as a fallback (used only when the operator-level config is missing or incomplete): + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: authbridge-config + namespace: my-agents +data: + KEYCLOAK_URL: https://keycloak.example.com + KEYCLOAK_REALM: my-custom-realm + CLIENT_AUTH_TYPE: client-secret + KEYCLOAK_TOKEN_EXCHANGE_ENABLED: "true" 
+``` + +Operator will prefer `kagenti-operator-config` from operator namespace, but fall back to namespace-local `authbridge-config` if operator config is incomplete. + +--- + +## Security Model + +### Admin Credential Isolation + +**Threat Model**: Compromised agent namespace should NOT expose Keycloak admin credentials. + +**Implementation**: +- `keycloak-admin-secret` exists ONLY in operator namespace (kagenti-system) +- Agent namespaces NEVER contain admin credentials +- ClientRegistrationReconciler reads from `r.OperatorNamespace` + +**Verification**: +```bash +# Admin secret should exist only here +kubectl get secret -n kagenti-system keycloak-admin-secret + +# Should fail (NotFound) +kubectl get secret -n my-agents keycloak-admin-secret +``` + +### Client Credential Lifecycle + +**Creation**: +- Operator creates secret with owner reference to Deployment/StatefulSet +- Secret automatically deleted when workload is deleted + +**Rotation**: +- Manual: Delete secret, operator will recreate with new credentials +- Automatic: Future enhancement (rotate on schedule) + +**Access Control**: +- Secret mounted read-only into agent pods +- RBAC: Only pods in the namespace can read the secret +- No cluster-wide secret access required + +### JWT Validation at Waypoint + +Waypoint gateways validate JWTs before routing: + +1. **Signature Verification**: RSA signature verified against Keycloak JWKS +2. **Expiry Check**: `exp` claim must be in the future +3. **Issuer Validation**: `iss` claim must match trusted Keycloak issuer +4. **Audience Validation**: `aud` claim must include target service +5. 
**Not-Before Check**: `nbf` claim (if present) must be in the past + +**Istio RequestAuthentication**: +```yaml +apiVersion: security.istio.io/v1 +kind: RequestAuthentication +metadata: + name: jwt-validation + namespace: my-agents +spec: + jwtRules: + - issuer: https://keycloak.example.com/realms/kagenti + jwksUri: https://keycloak.example.com/realms/kagenti/protocol/openid-connect/certs + audiences: + - my-agents/my-agent +``` + +**Istio AuthorizationPolicy**: +```yaml +apiVersion: security.istio.io/v1 +kind: AuthorizationPolicy +metadata: + name: require-jwt + namespace: my-agents +spec: + action: ALLOW + rules: + - from: + - source: + requestPrincipals: ["*"] + when: + - key: request.auth.claims[aud] + values: ["my-agents/my-agent"] +``` + +--- + +## Performance Characteristics + +### Resource Overhead + +**Waypoint Mode** (per namespace with 10 agents): +- Waypoint gateway pod: 1 +- Agent pods: 10 (1 container each) +- Total containers: 11 + +**Sidecar Mode** (per namespace with 10 agents): +- Agent pods: 10 (3 containers each: agent + envoy + spiffe-helper) +- Total containers: 30 + +**Savings**: 63% reduction in container count. + +### Latency Impact + +**L4 mTLS (ztunnel)**: +- Overhead: ~0.5ms (transparent TCP tunnel) +- CPU: Minimal (eBPF-based traffic capture) + +**L7 Proxy (waypoint)**: +- JWT validation: ~1-2ms (cached JWKS) +- Token exchange: ~100ms (roundtrip to Keycloak) +- Total L7 overhead: ~2-5ms (without token exchange) + +**Recommendation**: Cache access tokens and reuse until expiry (5 minutes default). 
+ +### Scalability + +**Waypoint Gateway Scaling**: +- Default: 1 replica per namespace +- High-traffic namespaces: Use HorizontalPodAutoscaler + ```yaml + apiVersion: autoscaling/v2 + kind: HorizontalPodAutoscaler + metadata: + name: team1-waypoint-hpa + namespace: team1 + spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: team1-waypoint + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + ``` + +**ztunnel Scaling**: +- DaemonSet: Scales with node count automatically +- No manual intervention required + +--- + +## Troubleshooting + +### Waypoint Gateway Not Created + +**Symptom**: Namespace has agents but no waypoint gateway. + +**Diagnosis**: +```bash +# Check namespace labels +kubectl get namespace my-agents -o yaml | grep labels -A 10 + +# Check operator logs +kubectl logs -n kagenti-system deployment/kagenti-controller-manager | grep waypoint + +# Check if waypoint provisioning enabled +kubectl get deployment -n kagenti-system kagenti-controller-manager -o yaml | grep enable-waypoint +``` + +**Solution**: +1. Ensure namespace has `kagenti.io/type: agent` label +2. Ensure operator has `--enable-waypoint-provisioning=true` flag +3. Check operator RBAC permissions for Gateway resources + +### Client Secret Not Created + +**Symptom**: Agent deployed but no `kagenti-keycloak-client-credentials-*` secret. + +**Diagnosis**: +```bash +# Check operator logs +kubectl logs -n kagenti-system deployment/kagenti-controller-manager | grep clientregistration + +# Check if keycloak-admin-secret exists +kubectl get secret -n kagenti-system keycloak-admin-secret + +# Check Keycloak config +kubectl get configmap -n kagenti-system kagenti-operator-config -o yaml +``` + +**Common Causes**: +1. Missing `keycloak-admin-secret` in kagenti-system +2. Incorrect `KEYCLOAK_URL` or `KEYCLOAK_REALM` in config +3. Keycloak admin credentials invalid (401 errors in logs) +4. 
Workload has `kagenti.io/client-registration-inject: "true"` (opt-out of operator management) + +**Solution**: +```bash +# Create admin secret if missing +kubectl create secret generic keycloak-admin-secret \ + -n kagenti-system \ + --from-literal=KEYCLOAK_ADMIN_USERNAME=admin \ + --from-literal=KEYCLOAK_ADMIN_PASSWORD=<password> + +# Verify Keycloak connectivity +curl -X POST "https://keycloak.example.com/realms/kagenti/protocol/openid-connect/token" \ + -d "grant_type=password" \ + -d "client_id=admin-cli" \ + -d "username=admin" \ + -d "password=<password>" +``` + +### Token Exchange Fails + +**Symptom**: `400 Bad Request: Requested audience not available: target-namespace/target-agent` + +**Diagnosis**: +```bash +# Decode JWT to see available audiences +TOKEN=$(cat /tmp/my_token.txt) +PAYLOAD=$(echo "$TOKEN" | cut -d'.' -f2) +echo "$PAYLOAD==" | base64 -d | jq '.aud' +``` + +**Root Cause**: Target audience not configured in Keycloak client scopes. + +**Solution**: +1. **Option A**: Use existing audience (check token's `aud` claim) +2. **Option B**: Configure Keycloak audience scopes: + - Navigate to Keycloak Admin Console + - Realms → kagenti → Client scopes + - Create audience scope for target agent + - Assign to source agent client + +**Future Enhancement**: Operator will automatically configure bidirectional audience scopes. + +### 503 Errors from Waypoint + +**Symptom**: `upstream connect error or disconnect/reset before headers` + +**Diagnosis**: +```bash +# Check waypoint logs +kubectl logs -n my-agents -l gateway.networking.k8s.io/gateway-name=my-agents-waypoint + +# Check target pod is running +kubectl get pods -n target-namespace -l app=target-agent + +# Check if target pod has listener on expected port +kubectl exec -n target-namespace deployment/target-agent -- netstat -tuln +``` + +**Common Causes**: +1. Target pod not running +2. Target pod has no HTTP server on expected port +3. Service port mismatch (Service port ≠ container port) + +**Solution**: +1. 
Ensure target pod has HTTP server listening +2. Verify Service port matches container port +3. Check Istio VirtualService / DestinationRule configuration (if custom routing) + +--- + +## FAQ + +### Q: Is waypoint mode the default? + +**A**: Yes, for new deployments. Waypoint mode is the default when: +- Operator has `--enable-waypoint-provisioning=true` +- Operator has `--enable-operator-client-registration=true` +- Workload does NOT have `kagenti.io/client-registration-inject: "true"` label + +Legacy sidecar mode is opt-in via the `kagenti.io/client-registration-inject: "true"` label. + +### Q: Can I mix waypoint and sidecar mode in the same cluster? + +**A**: Yes. Waypoint mode and sidecar mode can coexist: +- Waypoint mode: Namespaces with Istio ambient mesh + waypoint gateways +- Sidecar mode: Namespaces with Istio sidecar injection + +Just ensure the appropriate Istio configuration is applied per namespace. + +### Q: How do I migrate from sidecar to waypoint mode? + +**A**: See [Migration Guide](./migration-sidecar-to-waypoint.md) for step-by-step instructions. + +### Q: Does waypoint mode support SPIFFE/SPIRE? + +**A**: Yes. When `SPIRE_ENABLED=true` in configuration: +- Client IDs use SPIFFE format: `spiffe://<trust-domain>/ns/<namespace>/sa/<service-account>` +- Requires `--spire-trust-domain` flag on operator +- Requires dedicated ServiceAccount (not `default`) + +### Q: What happens if waypoint gateway pod crashes? + +**A**: Kubernetes will automatically restart the pod. During downtime: +- L4 mTLS still works (ztunnel) +- L7 requests will fail until waypoint recovers +- Consider deploying multiple waypoint replicas for HA + +### Q: How do I monitor waypoint gateways? 
+ +**A**: Waypoint gateways expose Prometheus metrics: +```bash +kubectl port-forward -n my-agents deployment/my-agents-waypoint 15020:15020 +curl http://localhost:15020/stats/prometheus +``` + +Key metrics: +- `istio_requests_total`: Request count +- `istio_request_duration_milliseconds`: Latency +- `istio_request_bytes`: Request size +- `envoy_cluster_upstream_cx_connect_fail`: Connection failures + +### Q: Can I customize waypoint gateway resources? + +**A**: Currently, waypoint gateways use Istio defaults. Custom resource limits can be set via Istio configuration. Future enhancement: Allow per-namespace customization. + +--- + +## Additional Resources + +- [Architecture Documentation](./architecture.md) +- [Operator-Managed Client Registration](./operator-managed-client-registration.md) +- [Migration Guide: Sidecar to Waypoint](./migration-sidecar-to-waypoint.md) +- [Istio Ambient Mesh Documentation](https://istio.io/latest/docs/ambient/) +- [OAuth 2.0 Token Exchange (RFC 8693)](https://datatracker.ietf.org/doc/html/rfc8693) +- [Keycloak Documentation](https://www.keycloak.org/documentation) From 5f88ed7b45ad95b68d7f4b1619a2ea283eb80be5 Mon Sep 17 00:00:00 2001 From: Akram Date: Fri, 3 Apr 2026 19:21:25 +0200 Subject: [PATCH 4/4] docs: Add Mermaid.js diagrams for waypoint mode documentation Created visual diagrams to complement waypoint-mode.md: - waypoint-architecture.mmd: High-level system architecture - agent-communication-flow.mmd: Token exchange sequence diagram - operator-reconciliation.mmd: Controller reconciliation flows - security-architecture.mmd: Centralized secret management model - waypoint-vs-sidecar.mmd: Resource comparison between modes Assisted-By: Claude (Anthropic AI) Signed-off-by: Akram --- .../diagrams/agent-communication-flow.mmd | 49 +++++++++++ .../docs/diagrams/operator-reconciliation.mmd | 71 +++++++++++++++ .../docs/diagrams/security-architecture.mmd | 68 +++++++++++++++ .../docs/diagrams/waypoint-architecture.mmd | 86 
+++++++++++++++++++ .../docs/diagrams/waypoint-vs-sidecar.mmd | 51 +++++++++++ 5 files changed, 325 insertions(+) create mode 100644 kagenti-operator/docs/diagrams/agent-communication-flow.mmd create mode 100644 kagenti-operator/docs/diagrams/operator-reconciliation.mmd create mode 100644 kagenti-operator/docs/diagrams/security-architecture.mmd create mode 100644 kagenti-operator/docs/diagrams/waypoint-architecture.mmd create mode 100644 kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd diff --git a/kagenti-operator/docs/diagrams/agent-communication-flow.mmd b/kagenti-operator/docs/diagrams/agent-communication-flow.mmd new file mode 100644 index 00000000..ef619186 --- /dev/null +++ b/kagenti-operator/docs/diagrams/agent-communication-flow.mmd @@ -0,0 +1,49 @@ +%% Agent-to-Agent Communication Flow with Token Exchange +sequenceDiagram + participant A1 as Agent Pod
(team1/agent-a) + participant KC as Keycloak + participant Z1 as ztunnel
(team1 node) + participant W1 as Waypoint Gateway
(team1-waypoint) + participant W2 as Waypoint Gateway
(team2-waypoint) + participant Z2 as ztunnel
(team2 node) + participant A2 as Agent Pod
(team2/agent-b) + + Note over A1: Step 1: Obtain Access Token + A1->>A1: Read credentials from
/shared/client-id.txt
/shared/client-secret.txt + A1->>KC: POST /token
grant_type=client_credentials
client_id=team1/agent-a
client_secret=<secret> + KC-->>A1: JWT Access Token
aud: [team2/agent-b, ...]
azp: team1/agent-a
exp: 300s + + Note over A1,KC: Step 2: (Optional) Token Exchange + A1->>KC: POST /token
grant_type=token-exchange
subject_token=<access_token>
audience=team2/agent-b + KC-->>A1: Exchanged JWT Token
aud: team2/agent-b
azp: team1/agent-a + + Note over A1,A2: Step 3: Cross-Namespace HTTP Request + A1->>Z1: HTTP GET /api/task
Host: agent-b.team2.svc.cluster.local
Authorization: Bearer <token> + + Note over Z1: L4 mTLS Tunnel + Z1->>W1: mTLS encrypted traffic
Source: team1/agent-a + + Note over W1: L7 Processing + W1->>W1: 1. Extract JWT from Authorization header + W1->>W1: 2. Validate JWT signature (Keycloak JWKS) + W1->>W1: 3. Check exp, iss, aud claims + W1->>W1: 4. Verify audience includes team2/agent-b + + Note over W1,W2: Cross-Namespace L4 mTLS + W1->>W2: mTLS encrypted traffic
Validated request + + Note over W2: L7 Validation (Defense in Depth) + W2->>W2: 1. Re-validate JWT signature + W2->>W2: 2. Check audience matches team2/agent-b + W2->>W2: 3. Apply AuthorizationPolicy (if configured) + + W2->>Z2: Forward validated request + Z2->>A2: HTTP GET /api/task
Headers: x-forwarded-client-cert, etc. + + A2-->>Z2: HTTP 200 OK
Response data + Z2-->>W2: Response + W2-->>W1: Response + W1-->>Z1: Response + Z1-->>A1: HTTP 200 OK
Response data + + Note over A1,A2: ✅ Authenticated & Authorized Communication Complete diff --git a/kagenti-operator/docs/diagrams/operator-reconciliation.mmd b/kagenti-operator/docs/diagrams/operator-reconciliation.mmd new file mode 100644 index 00000000..4110c648 --- /dev/null +++ b/kagenti-operator/docs/diagrams/operator-reconciliation.mmd @@ -0,0 +1,71 @@ +%% Operator Reconciliation Flows +graph TB + subgraph "NamespaceWaypointReconciler" + NWR_WATCH[Watch Namespaces
& Pods] + NWR_CHECK{Has kagenti.io/type=agent
workload pods?} + NWR_GATEWAY{Gateway exists?} + NWR_CREATE[Create Gateway Resource] + NWR_LABELS{Istio labels applied?} + NWR_APPLY[Apply Istio Labels:
- istio-discovery: enabled
- istio.io/dataplane-mode: ambient
- istio.io/use-waypoint: <namespace>-waypoint] + NWR_DONE[Reconcile Complete] + + NWR_WATCH --> NWR_CHECK + NWR_CHECK -->|Yes| NWR_GATEWAY + NWR_CHECK -->|No| NWR_DONE + NWR_GATEWAY -->|No| NWR_CREATE + NWR_GATEWAY -->|Yes| NWR_LABELS + NWR_CREATE --> NWR_LABELS + NWR_LABELS -->|No| NWR_APPLY + NWR_LABELS -->|Yes| NWR_DONE + NWR_APPLY --> NWR_DONE + end + + subgraph "ClientRegistrationReconciler" + CRR_WATCH[Watch Deployments
& StatefulSets] + CRR_FILTER{kagenti.io/type=agent
AND NOT client-registration-inject=true?} + CRR_GATES{Feature gates enabled?} + CRR_CONFIG[Read kagenti-operator-config
or authbridge-config] + CRR_ADMIN[Read keycloak-admin-secret
from operator namespace] + CRR_CLIENTID[Compute Client ID:
namespace/workload OR
spiffe://trust-domain/ns/.../sa/...] + CRR_REGISTER[Register/Fetch OIDC Client
in Keycloak] + CRR_AUDIENCE[Ensure Audience Scopes
for platform clients] + CRR_SECRET{Secret exists?} + CRR_CREATE_SEC[Create Secret:
kagenti-keycloak-client-credentials-<suffix>] + CRR_UPDATE_SEC[Update Secret if changed] + CRR_ANNOTATE[Annotate Pod Template:
kagenti.io/keycloak-client-credentials-secret-name] + CRR_DONE[Reconcile Complete] + + CRR_WATCH --> CRR_FILTER + CRR_FILTER -->|No| CRR_DONE + CRR_FILTER -->|Yes| CRR_GATES + CRR_GATES -->|Disabled| CRR_DONE + CRR_GATES -->|Enabled| CRR_CONFIG + CRR_CONFIG --> CRR_ADMIN + CRR_ADMIN --> CRR_CLIENTID + CRR_CLIENTID --> CRR_REGISTER + CRR_REGISTER --> CRR_AUDIENCE + CRR_AUDIENCE --> CRR_SECRET + CRR_SECRET -->|No| CRR_CREATE_SEC + CRR_SECRET -->|Yes| CRR_UPDATE_SEC + CRR_CREATE_SEC --> CRR_ANNOTATE + CRR_UPDATE_SEC --> CRR_ANNOTATE + CRR_ANNOTATE --> CRR_DONE + end + + subgraph "External Systems" + KEYCLOAK[Keycloak
OIDC Provider] + K8S_API[Kubernetes API
Gateway, Secret, ConfigMap] + end + + NWR_CREATE -.creates.-> K8S_API + NWR_APPLY -.patches.-> K8S_API + CRR_REGISTER -.HTTP API.-> KEYCLOAK + CRR_AUDIENCE -.HTTP API.-> KEYCLOAK + CRR_CREATE_SEC -.creates.-> K8S_API + CRR_ANNOTATE -.patches.-> K8S_API + + style NWR_DONE fill:#bfb,stroke:#333,stroke-width:2px + style CRR_DONE fill:#bfb,stroke:#333,stroke-width:2px + style CRR_ADMIN fill:#f9f,stroke:#333,stroke-width:2px + style KEYCLOAK fill:#ffd,stroke:#333,stroke-width:2px + style K8S_API fill:#bbf,stroke:#333,stroke-width:2px diff --git a/kagenti-operator/docs/diagrams/security-architecture.mmd b/kagenti-operator/docs/diagrams/security-architecture.mmd new file mode 100644 index 00000000..4620b108 --- /dev/null +++ b/kagenti-operator/docs/diagrams/security-architecture.mmd @@ -0,0 +1,68 @@ +%% Security Architecture - Centralized Secrets +graph TB + subgraph "kagenti-system Namespace (Operator)" + OP[Kagenti Operator Pod] + ADMIN_SEC[🔒 keycloak-admin-secret
KEYCLOAK_ADMIN_USERNAME
KEYCLOAK_ADMIN_PASSWORD] + OP_CFG[kagenti-operator-config
KEYCLOAK_URL
KEYCLOAK_REALM
...] + + OP -->|reads| ADMIN_SEC + OP -->|reads| OP_CFG + end + + subgraph "Keycloak (External)" + KC_ADMIN[Admin REST API
/admin/realms/kagenti/clients] + KC_TOKEN[Token Endpoint
/realms/kagenti/protocol/openid-connect/token] + end + + subgraph "team1 Namespace (Agents)" + AGENT1[Agent Pod
team1/agent-a] + CLIENT_SEC1[🔑 Client Credentials Secret
client-id: team1/agent-a
client-secret: <secret>] + + AGENT1 -->|mounts read-only| CLIENT_SEC1 + AGENT1 -.❌ NO ACCESS.-> ADMIN_SEC + end + + subgraph "team2 Namespace (Agents)" + AGENT2[Agent Pod
team2/agent-b] + CLIENT_SEC2[🔑 Client Credentials Secret
client-id: team2/agent-b
client-secret: <secret>] + + AGENT2 -->|mounts read-only| CLIENT_SEC2 + AGENT2 -.❌ NO ACCESS.-> ADMIN_SEC + end + + subgraph "team3 Namespace (Agents)" + AGENT3[Agent Pod
team3/agent-c] + CLIENT_SEC3[🔑 Client Credentials Secret
client-id: team3/agent-c
client-secret: <secret>] + + AGENT3 -->|mounts read-only| CLIENT_SEC3 + AGENT3 -.❌ NO ACCESS.-> ADMIN_SEC + end + + %% Operator uses admin credentials to manage clients + OP -->|authenticates with
admin credentials| KC_ADMIN + OP -->|registers OIDC clients| KC_ADMIN + OP -->|creates secrets| CLIENT_SEC1 + OP -->|creates secrets| CLIENT_SEC2 + OP -->|creates secrets| CLIENT_SEC3 + + %% Agents use client credentials for tokens + AGENT1 -->|client_credentials grant| KC_TOKEN + AGENT2 -->|client_credentials grant| KC_TOKEN + AGENT3 -->|client_credentials grant| KC_TOKEN + + %% Security boundaries + classDef adminSecret fill:#f9f,stroke:#900,stroke-width:3px + classDef clientSecret fill:#bfb,stroke:#090,stroke-width:2px + classDef noAccess stroke:#f00,stroke-dasharray: 5 5,stroke-width:2px + + class ADMIN_SEC adminSecret + class CLIENT_SEC1,CLIENT_SEC2,CLIENT_SEC3 clientSecret + + %% Annotations + note1[🔒 Admin Secret
- ONLY in operator namespace
- NEVER in agent namespaces
- Used ONLY by operator] + note2[🔑 Client Secrets
- One per agent workload
- Owner reference to workload
- Auto-deleted with workload] + note3[✅ Security Benefits
- Reduced attack surface
- Principle of least privilege
- Centralized rotation
- Simplified auditing] + + style note1 fill:#fff,stroke:#900,stroke-width:2px + style note2 fill:#fff,stroke:#090,stroke-width:2px + style note3 fill:#fff,stroke:#009,stroke-width:2px diff --git a/kagenti-operator/docs/diagrams/waypoint-architecture.mmd b/kagenti-operator/docs/diagrams/waypoint-architecture.mmd new file mode 100644 index 00000000..889974a6 --- /dev/null +++ b/kagenti-operator/docs/diagrams/waypoint-architecture.mmd @@ -0,0 +1,86 @@ +%% High-Level Waypoint Mode Architecture +graph TB + subgraph "Operator Namespace (kagenti-system)" + OP[kagenti-controller-manager] + NWRC[NamespaceWaypointReconciler] + CRC[ClientRegistrationReconciler] + OPSEC[keycloak-admin-secret
ADMIN CREDENTIALS] + OPCFG[kagenti-operator-config
KEYCLOAK_URL, REALM, etc.] + + OP --> NWRC + OP --> CRC + CRC --> OPSEC + CRC --> OPCFG + NWRC --> OPCFG + end + + subgraph "Keycloak (External)" + KC[Keycloak Server] + REALM[Realm: kagenti] + KC --> REALM + end + + subgraph "Agent Namespace: team1" + NS1[Namespace
labels: kagenti.io/type=agent] + GW1[Istio Gateway
team1-waypoint] + GWP1[Waypoint Pod
L7 Envoy Proxy] + + AGENT1[Agent Pod
1 container] + SEC1[Client Credentials Secret
client-id.txt
client-secret.txt] + + NS1 -.istio labels.-> GW1 + GW1 --> GWP1 + AGENT1 -.mounts.-> SEC1 + end + + subgraph "Agent Namespace: team2" + NS2[Namespace
labels: kagenti.io/type=agent] + GW2[Istio Gateway
team2-waypoint] + GWP2[Waypoint Pod
L7 Envoy Proxy] + + AGENT2[Agent Pod
1 container] + SEC2[Client Credentials Secret
client-id.txt
client-secret.txt] + + NS2 -.istio labels.-> GW2 + GW2 --> GWP2 + AGENT2 -.mounts.-> SEC2 + end + + subgraph "Istio Control Plane" + ISTIOD[istiod
Control Plane] + ZTUNNEL[ztunnel DaemonSet
L4 mTLS] + end + + %% Operator provisions infrastructure + NWRC -->|creates| GW1 + NWRC -->|creates| GW2 + NWRC -->|applies labels| NS1 + NWRC -->|applies labels| NS2 + + %% Operator manages client registration + CRC -->|registers OIDC client| KC + CRC -->|creates secret| SEC1 + CRC -->|creates secret| SEC2 + + %% Istio manages waypoint pods + ISTIOD -.manages.-> GWP1 + ISTIOD -.manages.-> GWP2 + ISTIOD -.manages.-> ZTUNNEL + + %% Data plane traffic + AGENT1 -->|L4 mTLS| ZTUNNEL + ZTUNNEL -->|routes to| GWP1 + GWP1 -->|L7 proxy| GWP2 + GWP2 -->|routes to| AGENT2 + + %% Authentication flow + AGENT1 -.obtains token.-> KC + AGENT2 -.obtains token.-> KC + + style OPSEC fill:#f9f,stroke:#333,stroke-width:2px + style OPCFG fill:#bbf,stroke:#333,stroke-width:2px + style SEC1 fill:#bfb,stroke:#333,stroke-width:2px + style SEC2 fill:#bfb,stroke:#333,stroke-width:2px + style GWP1 fill:#ffd,stroke:#333,stroke-width:2px + style GWP2 fill:#ffd,stroke:#333,stroke-width:2px + style ZTUNNEL fill:#ddf,stroke:#333,stroke-width:2px diff --git a/kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd b/kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd new file mode 100644 index 00000000..784adf3b --- /dev/null +++ b/kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd @@ -0,0 +1,51 @@ +%% Waypoint Mode vs Sidecar Mode Comparison +graph TB + subgraph "Waypoint Mode (Default)" + subgraph "Namespace: team1" + WP_GW[Waypoint Gateway Pod
Shared L7 Proxy] + WP_POD1[Agent Pod 1
├─ agent container] + WP_POD2[Agent Pod 2
├─ agent container] + WP_POD3[Agent Pod 3
├─ agent container] + + WP_POD1 -.routes through.-> WP_GW + WP_POD2 -.routes through.-> WP_GW + WP_POD3 -.routes through.-> WP_GW + end + + WP_ZTUNNEL[ztunnel DaemonSet
L4 mTLS for all pods] + + WP_GW --> WP_ZTUNNEL + + WP_METRICS[📊 Resource Usage
3 agent pods = 4 containers total
- 3 agent containers
- 1 waypoint gateway

✅ 66% reduction vs sidecar] + end + + subgraph "Sidecar Mode (Legacy)" + subgraph "Namespace: team2" + SC_POD1[Agent Pod 1
├─ agent container
├─ envoy-proxy sidecar
├─ spiffe-helper sidecar
└─ client-registration sidecar] + SC_POD2[Agent Pod 2
├─ agent container
├─ envoy-proxy sidecar
├─ spiffe-helper sidecar
└─ client-registration sidecar] + SC_POD3[Agent Pod 3
├─ agent container
├─ envoy-proxy sidecar
├─ spiffe-helper sidecar
└─ client-registration sidecar] + end + + SC_METRICS[📊 Resource Usage
3 agent pods = 12 containers total
- 3 agent containers
- 3 envoy-proxy sidecars
- 3 spiffe-helper sidecars
- 3 client-registration sidecars

⚠️ High per-pod overhead] + end + + subgraph "Comparison Table" + TABLE[" + | Aspect | Waypoint | Sidecar | + |--------|----------|---------| + | Containers/pod | 1 | 4 | + | L7 Proxy | Shared | Per-pod | + | Client Reg | Operator | Sidecar | + | Resource Overhead | Low | High | + | Pod Startup | Fast | Slower | + "] + end + + style WP_GW fill:#ffd,stroke:#333,stroke-width:2px + style WP_ZTUNNEL fill:#ddf,stroke:#333,stroke-width:2px + style WP_METRICS fill:#bfb,stroke:#090,stroke-width:2px + style SC_POD1 fill:#fdd,stroke:#333,stroke-width:2px + style SC_POD2 fill:#fdd,stroke:#333,stroke-width:2px + style SC_POD3 fill:#fdd,stroke:#333,stroke-width:2px + style SC_METRICS fill:#fbb,stroke:#900,stroke-width:2px + style TABLE fill:#fff,stroke:#333,stroke-width:1px