diff --git a/kagenti-operator/cmd/main.go b/kagenti-operator/cmd/main.go
index f35eff05..9b846830 100644
--- a/kagenti-operator/cmd/main.go
+++ b/kagenti-operator/cmd/main.go
@@ -48,6 +48,7 @@ import (
"github.com/kagenti/operator/internal/signature"
"github.com/kagenti/operator/internal/tekton"
webhookv1alpha1 "github.com/kagenti/operator/internal/webhook/v1alpha1"
+ gwapiv1 "sigs.k8s.io/gateway-api/apis/v1"
// +kubebuilder:scaffold:imports
)
@@ -60,6 +61,7 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(agentv1alpha1.AddToScheme(scheme))
utilruntime.Must(tekton.AddToScheme(scheme))
+ utilruntime.Must(gwapiv1.Install(scheme))
// +kubebuilder:scaffold:scheme
}
@@ -78,7 +80,9 @@ func main() {
var signatureAuditMode bool
var enforceNetworkPolicies bool
var enableOperatorClientRegistration bool
+ var enableWaypointProvisioning bool
+ var operatorNamespace string
var spireTrustDomain string
var spireTrustBundleConfigMapName string
var spireTrustBundleConfigMapNS string
@@ -86,6 +90,8 @@ func main() {
var spireTrustBundleRefreshInterval time.Duration
var svidExpiryGracePeriod time.Duration
+ flag.StringVar(&operatorNamespace, "operator-namespace", os.Getenv("POD_NAMESPACE"),
+ "Namespace where the operator is running (default: POD_NAMESPACE env var, fallback: 'kagenti-system')")
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -112,6 +118,8 @@ func main() {
flag.BoolVar(&enableOperatorClientRegistration, "enable-operator-client-registration", false,
"Reconcile Keycloak client registration for agent/tool workloads unless "+
"kagenti.io/client-registration-inject=true (legacy sidecar)")
+ flag.BoolVar(&enableWaypointProvisioning, "enable-waypoint-provisioning", true,
+ "Automatically provision Istio waypoint gateways for namespaces with Kagenti workloads")
flag.StringVar(&spireTrustDomain, "spire-trust-domain", "",
"SPIRE trust domain for identity binding (e.g. 'example.org')")
@@ -134,6 +142,12 @@ func main() {
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
+ // Default operator namespace if not set
+ if operatorNamespace == "" {
+ operatorNamespace = "kagenti-system"
+ setupLog.Info("operator-namespace not set, using default", "namespace", operatorNamespace)
+ }
+
// Mitigate CVE-2023-44487 (HTTP/2 Rapid Reset).
disableHTTP2 := func(c *tls.Config) {
c.NextProtos = []string{"http/1.1"}
@@ -201,7 +215,10 @@ func main() {
Scheme: scheme,
Metrics: metricsServerOptions,
Cache: cache.Options{
- DefaultNamespaces: getNamespacesToWatch(),
+ // Note: DefaultNamespaces is intentionally not set (removed getNamespacesToWatch()).
+ // When not set, the cache defaults to cluster-wide for all resources except those
+ // explicitly scoped in ByObject below.
+ //
// Scope the ConfigMap informer to only kagenti-relevant ConfigMaps.
// Without this, the controller would cache ALL ConfigMaps cluster-wide.
//
@@ -230,6 +247,10 @@ func main() {
},
},
},
+ // NOTE: All other resources (Namespace, Pod, Deployment, StatefulSet, Gateway)
+ // are intentionally NOT in ByObject. With DefaultNamespaces not set, they will
+ // automatically use the default cluster-wide cache, which is what we want.
+ // Explicitly adding them to ByObject was preventing controllers from starting.
},
},
WebhookServer: webhookServer,
@@ -335,13 +356,15 @@ func main() {
Client: mgr.GetClient(),
APIReader: mgr.GetAPIReader(),
Scheme: mgr.GetScheme(),
+ OperatorNamespace: operatorNamespace,
SpireTrustDomain: spireTrustDomain,
KeycloakAdminTokenCache: &keycloak.CachedAdminTokenProvider{},
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "ClientRegistration")
os.Exit(1)
}
- setupLog.Info("Operator-managed client registration controller enabled")
+ setupLog.Info("Operator-managed client registration controller enabled",
+ "operatorNamespace", operatorNamespace)
}
if controller.TektonConfigCRDExists(mgr.GetConfig()) {
@@ -353,6 +376,18 @@ func main() {
}
}
+ if enableWaypointProvisioning {
+ if err = (&controller.NamespaceWaypointReconciler{
+ Client: mgr.GetClient(),
+ Scheme: mgr.GetScheme(),
+ EnableWaypointProvisioning: enableWaypointProvisioning,
+ }).SetupWithManager(mgr); err != nil {
+ setupLog.Error(err, "unable to create controller", "controller", "NamespaceWaypoint")
+ os.Exit(1)
+ }
+ setupLog.Info("Waypoint provisioning controller enabled")
+ }
+
if err = webhookv1alpha1.SetupAgentCardWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "AgentCard")
os.Exit(1)
diff --git a/kagenti-operator/docs/diagrams/agent-communication-flow.mmd b/kagenti-operator/docs/diagrams/agent-communication-flow.mmd
new file mode 100644
index 00000000..ef619186
--- /dev/null
+++ b/kagenti-operator/docs/diagrams/agent-communication-flow.mmd
@@ -0,0 +1,49 @@
+%% Agent-to-Agent Communication Flow with Token Exchange
+sequenceDiagram
+ participant A1 as Agent Pod<br/>(team1/agent-a)
+ participant KC as Keycloak
+ participant Z1 as ztunnel<br/>(team1 node)
+ participant W1 as Waypoint Gateway<br/>(team1-waypoint)
+ participant W2 as Waypoint Gateway<br/>(team2-waypoint)
+ participant Z2 as ztunnel<br/>(team2 node)
+ participant A2 as Agent Pod<br/>(team2/agent-b)
+
+ Note over A1: Step 1: Obtain Access Token
+ A1->>A1: Read credentials from<br/>/shared/client-id.txt<br/>/shared/client-secret.txt
+ A1->>KC: POST /token<br/>grant_type=client_credentials<br/>client_id=team1/agent-a<br/>client_secret=#lt;secret#gt;
+ KC-->>A1: JWT Access Token<br/>aud: [team2/agent-b, ...]<br/>azp: team1/agent-a<br/>exp: 300s
+
+ Note over A1,KC: Step 2: (Optional) Token Exchange
+ A1->>KC: POST /token<br/>grant_type=token-exchange<br/>subject_token=#lt;access-token#gt;<br/>audience=team2/agent-b
+ KC-->>A1: Exchanged JWT Token<br/>aud: team2/agent-b<br/>azp: team1/agent-a
+
+ Note over A1,A2: Step 3: Cross-Namespace HTTP Request
+ A1->>Z1: HTTP GET /api/task<br/>Host: agent-b.team2.svc.cluster.local<br/>Authorization: Bearer #lt;token#gt;
+
+ Note over Z1: L4 mTLS Tunnel
+ Z1->>W1: mTLS encrypted traffic<br/>Source: team1/agent-a
+
+ Note over W1: L7 Processing
+ W1->>W1: 1. Extract JWT from Authorization header
+ W1->>W1: 2. Validate JWT signature (Keycloak JWKS)
+ W1->>W1: 3. Check exp, iss, aud claims
+ W1->>W1: 4. Verify audience includes team2/agent-b
+
+ Note over W1,W2: Cross-Namespace L4 mTLS
+ W1->>W2: mTLS encrypted traffic<br/>Validated request
+
+ Note over W2: L7 Validation (Defense in Depth)
+ W2->>W2: 1. Re-validate JWT signature
+ W2->>W2: 2. Check audience matches team2/agent-b
+ W2->>W2: 3. Apply AuthorizationPolicy (if configured)
+
+ W2->>Z2: Forward validated request
+ Z2->>A2: HTTP GET /api/task<br/>Headers: x-forwarded-client-cert, etc.
+
+ A2-->>Z2: HTTP 200 OK<br/>Response data
+ Z2-->>W2: Response
+ W2-->>W1: Response
+ W1-->>Z1: Response
+ Z1-->>A1: HTTP 200 OK<br/>Response data
+
+ Note over A1,A2: ✅ Authenticated & Authorized Communication Complete
diff --git a/kagenti-operator/docs/diagrams/operator-reconciliation.mmd b/kagenti-operator/docs/diagrams/operator-reconciliation.mmd
new file mode 100644
index 00000000..4110c648
--- /dev/null
+++ b/kagenti-operator/docs/diagrams/operator-reconciliation.mmd
@@ -0,0 +1,71 @@
+%% Operator Reconciliation Flows
+graph TB
+ subgraph "NamespaceWaypointReconciler"
+ NWR_WATCH[Watch Namespaces<br/>& Pods]
+ NWR_CHECK{Has kagenti.io/type=agent<br/>workload pods?}
+ NWR_GATEWAY{Gateway exists?}
+ NWR_CREATE[Create Gateway Resource]
+ NWR_LABELS{Istio labels applied?}
+ NWR_APPLY[Apply Istio Labels:<br/>- istio-discovery: enabled<br/>- istio.io/dataplane-mode: ambient<br/>- istio.io/use-waypoint: #lt;namespace#gt;-waypoint]
+ NWR_DONE[Reconcile Complete]
+
+ NWR_WATCH --> NWR_CHECK
+ NWR_CHECK -->|Yes| NWR_GATEWAY
+ NWR_CHECK -->|No| NWR_DONE
+ NWR_GATEWAY -->|No| NWR_CREATE
+ NWR_GATEWAY -->|Yes| NWR_LABELS
+ NWR_CREATE --> NWR_LABELS
+ NWR_LABELS -->|No| NWR_APPLY
+ NWR_LABELS -->|Yes| NWR_DONE
+ NWR_APPLY --> NWR_DONE
+ end
+
+ subgraph "ClientRegistrationReconciler"
+ CRR_WATCH[Watch Deployments<br/>& StatefulSets]
+ CRR_FILTER{kagenti.io/type=agent<br/>AND NOT client-registration-inject=true?}
+ CRR_GATES{Feature gates enabled?}
+ CRR_CONFIG[Read kagenti-operator-config<br/>or authbridge-config]
+ CRR_ADMIN[Read keycloak-admin-secret<br/>from operator namespace]
+ CRR_CLIENTID[Compute Client ID:<br/>namespace/workload OR<br/>spiffe://trust-domain/ns/.../sa/...]
+ CRR_REGISTER[Register/Fetch OIDC Client<br/>in Keycloak]
+ CRR_AUDIENCE[Ensure Audience Scopes<br/>for platform clients]
+ CRR_SECRET{Secret exists?}
+ CRR_CREATE_SEC[Create Secret:<br/>kagenti-keycloak-client-credentials-#lt;suffix#gt;]
+ CRR_UPDATE_SEC[Update Secret if changed]
+ CRR_ANNOTATE[Annotate Pod Template:<br/>kagenti.io/keycloak-client-credentials-secret-name]
+ CRR_DONE[Reconcile Complete]
+
+ CRR_WATCH --> CRR_FILTER
+ CRR_FILTER -->|No| CRR_DONE
+ CRR_FILTER -->|Yes| CRR_GATES
+ CRR_GATES -->|Disabled| CRR_DONE
+ CRR_GATES -->|Enabled| CRR_CONFIG
+ CRR_CONFIG --> CRR_ADMIN
+ CRR_ADMIN --> CRR_CLIENTID
+ CRR_CLIENTID --> CRR_REGISTER
+ CRR_REGISTER --> CRR_AUDIENCE
+ CRR_AUDIENCE --> CRR_SECRET
+ CRR_SECRET -->|No| CRR_CREATE_SEC
+ CRR_SECRET -->|Yes| CRR_UPDATE_SEC
+ CRR_CREATE_SEC --> CRR_ANNOTATE
+ CRR_UPDATE_SEC --> CRR_ANNOTATE
+ CRR_ANNOTATE --> CRR_DONE
+ end
+
+ subgraph "External Systems"
+ KEYCLOAK[Keycloak<br/>OIDC Provider]
+ K8S_API[Kubernetes API<br/>Gateway, Secret, ConfigMap]
+ end
+
+ NWR_CREATE -.creates.-> K8S_API
+ NWR_APPLY -.patches.-> K8S_API
+ CRR_REGISTER -.HTTP API.-> KEYCLOAK
+ CRR_AUDIENCE -.HTTP API.-> KEYCLOAK
+ CRR_CREATE_SEC -.creates.-> K8S_API
+ CRR_ANNOTATE -.patches.-> K8S_API
+
+ style NWR_DONE fill:#bfb,stroke:#333,stroke-width:2px
+ style CRR_DONE fill:#bfb,stroke:#333,stroke-width:2px
+ style CRR_ADMIN fill:#f9f,stroke:#333,stroke-width:2px
+ style KEYCLOAK fill:#ffd,stroke:#333,stroke-width:2px
+ style K8S_API fill:#bbf,stroke:#333,stroke-width:2px
diff --git a/kagenti-operator/docs/diagrams/security-architecture.mmd b/kagenti-operator/docs/diagrams/security-architecture.mmd
new file mode 100644
index 00000000..4620b108
--- /dev/null
+++ b/kagenti-operator/docs/diagrams/security-architecture.mmd
@@ -0,0 +1,68 @@
+%% Security Architecture - Centralized Secrets
+graph TB
+ subgraph "kagenti-system Namespace (Operator)"
+ OP[Kagenti Operator Pod]
+ ADMIN_SEC[🔐 keycloak-admin-secret<br/>KEYCLOAK_ADMIN_USERNAME<br/>KEYCLOAK_ADMIN_PASSWORD]
+ OP_CFG[kagenti-operator-config<br/>KEYCLOAK_URL<br/>KEYCLOAK_REALM<br/>...]
+
+ OP -->|reads| ADMIN_SEC
+ OP -->|reads| OP_CFG
+ end
+
+ subgraph "Keycloak (External)"
+ KC_ADMIN[Admin REST API<br/>/admin/realms/kagenti/clients]
+ KC_TOKEN[Token Endpoint<br/>/realms/kagenti/protocol/openid-connect/token]
+ end
+
+ subgraph "team1 Namespace (Agents)"
+ AGENT1[Agent Pod<br/>team1/agent-a]
+ CLIENT_SEC1[🔑 Client Credentials Secret<br/>client-id: team1/agent-a<br/>client-secret: #lt;generated#gt;]
+
+ AGENT1 -->|mounts read-only| CLIENT_SEC1
+ AGENT1 -.❌ NO ACCESS.-> ADMIN_SEC
+ end
+
+ subgraph "team2 Namespace (Agents)"
+ AGENT2[Agent Pod<br/>team2/agent-b]
+ CLIENT_SEC2[🔑 Client Credentials Secret<br/>client-id: team2/agent-b<br/>client-secret: #lt;generated#gt;]
+
+ AGENT2 -->|mounts read-only| CLIENT_SEC2
+ AGENT2 -.❌ NO ACCESS.-> ADMIN_SEC
+ end
+
+ subgraph "team3 Namespace (Agents)"
+ AGENT3[Agent Pod<br/>team3/agent-c]
+ CLIENT_SEC3[🔑 Client Credentials Secret<br/>client-id: team3/agent-c<br/>client-secret: #lt;generated#gt;]
+
+ AGENT3 -->|mounts read-only| CLIENT_SEC3
+ AGENT3 -.❌ NO ACCESS.-> ADMIN_SEC
+ end
+
+ %% Operator uses admin credentials to manage clients
+ OP -->|authenticates with<br/>admin credentials| KC_ADMIN
+ OP -->|registers OIDC clients| KC_ADMIN
+ OP -->|creates secrets| CLIENT_SEC1
+ OP -->|creates secrets| CLIENT_SEC2
+ OP -->|creates secrets| CLIENT_SEC3
+
+ %% Agents use client credentials for tokens
+ AGENT1 -->|client_credentials grant| KC_TOKEN
+ AGENT2 -->|client_credentials grant| KC_TOKEN
+ AGENT3 -->|client_credentials grant| KC_TOKEN
+
+ %% Security boundaries
+ classDef adminSecret fill:#f9f,stroke:#900,stroke-width:3px
+ classDef clientSecret fill:#bfb,stroke:#090,stroke-width:2px
+ classDef noAccess stroke:#f00,stroke-dasharray: 5 5,stroke-width:2px
+
+ class ADMIN_SEC adminSecret
+ class CLIENT_SEC1,CLIENT_SEC2,CLIENT_SEC3 clientSecret
+
+ %% Annotations
+ note1[🔐 Admin Secret<br/>- ONLY in operator namespace<br/>- NEVER in agent namespaces<br/>- Used ONLY by operator]
+ note2[🔑 Client Secrets<br/>- One per agent workload<br/>- Owner reference to workload<br/>- Auto-deleted with workload]
+ note3[✅ Security Benefits<br/>- Reduced attack surface<br/>- Principle of least privilege<br/>- Centralized rotation<br/>- Simplified auditing]
+
+ style note1 fill:#fff,stroke:#900,stroke-width:2px
+ style note2 fill:#fff,stroke:#090,stroke-width:2px
+ style note3 fill:#fff,stroke:#009,stroke-width:2px
diff --git a/kagenti-operator/docs/diagrams/waypoint-architecture.mmd b/kagenti-operator/docs/diagrams/waypoint-architecture.mmd
new file mode 100644
index 00000000..889974a6
--- /dev/null
+++ b/kagenti-operator/docs/diagrams/waypoint-architecture.mmd
@@ -0,0 +1,86 @@
+%% High-Level Waypoint Mode Architecture
+graph TB
+ subgraph "Operator Namespace (kagenti-system)"
+ OP[kagenti-controller-manager]
+ NWRC[NamespaceWaypointReconciler]
+ CRC[ClientRegistrationReconciler]
+ OPSEC[keycloak-admin-secret<br/>ADMIN CREDENTIALS]
+ OPCFG[kagenti-operator-config<br/>KEYCLOAK_URL, REALM, etc.]
+
+ OP --> NWRC
+ OP --> CRC
+ CRC --> OPSEC
+ CRC --> OPCFG
+ NWRC --> OPCFG
+ end
+
+ subgraph "Keycloak (External)"
+ KC[Keycloak Server]
+ REALM[Realm: kagenti]
+ KC --> REALM
+ end
+
+ subgraph "Agent Namespace: team1"
+ NS1[Namespace<br/>labels: kagenti.io/type=agent]
+ GW1[Istio Gateway<br/>team1-waypoint]
+ GWP1[Waypoint Pod<br/>L7 Envoy Proxy]
+
+ AGENT1[Agent Pod<br/>1 container]
+ SEC1[Client Credentials Secret<br/>client-id.txt<br/>client-secret.txt]
+
+ NS1 -.istio labels.-> GW1
+ GW1 --> GWP1
+ AGENT1 -.mounts.-> SEC1
+ end
+
+ subgraph "Agent Namespace: team2"
+ NS2[Namespace<br/>labels: kagenti.io/type=agent]
+ GW2[Istio Gateway<br/>team2-waypoint]
+ GWP2[Waypoint Pod<br/>L7 Envoy Proxy]
+
+ AGENT2[Agent Pod<br/>1 container]
+ SEC2[Client Credentials Secret<br/>client-id.txt<br/>client-secret.txt]
+
+ NS2 -.istio labels.-> GW2
+ GW2 --> GWP2
+ AGENT2 -.mounts.-> SEC2
+ end
+
+ subgraph "Istio Control Plane"
+ ISTIOD[istiod<br/>Control Plane]
+ ZTUNNEL[ztunnel DaemonSet<br/>L4 mTLS]
+ end
+
+ %% Operator provisions infrastructure
+ NWRC -->|creates| GW1
+ NWRC -->|creates| GW2
+ NWRC -->|applies labels| NS1
+ NWRC -->|applies labels| NS2
+
+ %% Operator manages client registration
+ CRC -->|registers OIDC client| KC
+ CRC -->|creates secret| SEC1
+ CRC -->|creates secret| SEC2
+
+ %% Istio manages waypoint pods
+ ISTIOD -.manages.-> GWP1
+ ISTIOD -.manages.-> GWP2
+ ISTIOD -.manages.-> ZTUNNEL
+
+ %% Data plane traffic
+ AGENT1 -->|L4 mTLS| ZTUNNEL
+ ZTUNNEL -->|routes to| GWP1
+ GWP1 -->|L7 proxy| GWP2
+ GWP2 -->|routes to| AGENT2
+
+ %% Authentication flow
+ AGENT1 -.obtains token.-> KC
+ AGENT2 -.obtains token.-> KC
+
+ style OPSEC fill:#f9f,stroke:#333,stroke-width:2px
+ style OPCFG fill:#bbf,stroke:#333,stroke-width:2px
+ style SEC1 fill:#bfb,stroke:#333,stroke-width:2px
+ style SEC2 fill:#bfb,stroke:#333,stroke-width:2px
+ style GWP1 fill:#ffd,stroke:#333,stroke-width:2px
+ style GWP2 fill:#ffd,stroke:#333,stroke-width:2px
+ style ZTUNNEL fill:#ddf,stroke:#333,stroke-width:2px
diff --git a/kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd b/kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd
new file mode 100644
index 00000000..784adf3b
--- /dev/null
+++ b/kagenti-operator/docs/diagrams/waypoint-vs-sidecar.mmd
@@ -0,0 +1,51 @@
+%% Waypoint Mode vs Sidecar Mode Comparison
+graph TB
+ subgraph "Waypoint Mode (Default)"
+ subgraph "Namespace: team1"
+ WP_GW[Waypoint Gateway Pod<br/>Shared L7 Proxy]
+ WP_POD1[Agent Pod 1<br/>└─ agent container]
+ WP_POD2[Agent Pod 2<br/>└─ agent container]
+ WP_POD3[Agent Pod 3<br/>└─ agent container]
+
+ WP_POD1 -.routes through.-> WP_GW
+ WP_POD2 -.routes through.-> WP_GW
+ WP_POD3 -.routes through.-> WP_GW
+ end
+
+ WP_ZTUNNEL[ztunnel DaemonSet<br/>L4 mTLS for all pods]
+
+ WP_GW --> WP_ZTUNNEL
+
+ WP_METRICS[📊 Resource Usage<br/>3 agent pods = 4 containers total<br/>- 3 agent containers<br/>- 1 waypoint gateway<br/>✅ 66% reduction vs sidecar]
+ end
+
+ subgraph "Sidecar Mode (Legacy)"
+ subgraph "Namespace: team2"
+ SC_POD1[Agent Pod 1<br/>├─ agent container<br/>├─ envoy-proxy sidecar<br/>├─ spiffe-helper sidecar<br/>└─ client-registration sidecar]
+ SC_POD2[Agent Pod 2<br/>├─ agent container<br/>├─ envoy-proxy sidecar<br/>├─ spiffe-helper sidecar<br/>└─ client-registration sidecar]
+ SC_POD3[Agent Pod 3<br/>├─ agent container<br/>├─ envoy-proxy sidecar<br/>├─ spiffe-helper sidecar<br/>└─ client-registration sidecar]
+ end
+
+ SC_METRICS[📊 Resource Usage<br/>3 agent pods = 12 containers total<br/>- 3 agent containers<br/>- 3 envoy-proxy sidecars<br/>- 3 spiffe-helper sidecars<br/>- 3 client-registration sidecars<br/>⚠️ High per-pod overhead]
+ end
+
+ subgraph "Comparison Table"
+ TABLE["
+ | Aspect | Waypoint | Sidecar |
+ |--------|----------|---------|
+ | Containers/pod | 1 | 4 |
+ | L7 Proxy | Shared | Per-pod |
+ | Client Reg | Operator | Sidecar |
+ | Resource Overhead | Low | High |
+ | Pod Startup | Fast | Slower |
+ "]
+ end
+
+ style WP_GW fill:#ffd,stroke:#333,stroke-width:2px
+ style WP_ZTUNNEL fill:#ddf,stroke:#333,stroke-width:2px
+ style WP_METRICS fill:#bfb,stroke:#090,stroke-width:2px
+ style SC_POD1 fill:#fdd,stroke:#333,stroke-width:2px
+ style SC_POD2 fill:#fdd,stroke:#333,stroke-width:2px
+ style SC_POD3 fill:#fdd,stroke:#333,stroke-width:2px
+ style SC_METRICS fill:#fbb,stroke:#900,stroke-width:2px
+ style TABLE fill:#fff,stroke:#333,stroke-width:1px
diff --git a/kagenti-operator/docs/migration-sidecar-to-waypoint.md b/kagenti-operator/docs/migration-sidecar-to-waypoint.md
new file mode 100644
index 00000000..5af07917
--- /dev/null
+++ b/kagenti-operator/docs/migration-sidecar-to-waypoint.md
@@ -0,0 +1,646 @@
+# Migration Guide: Sidecar Mode to Waypoint Mode
+
+**Version**: 1.0
+**Last Updated**: 2026-04-03
+**Audience**: Platform teams, DevOps engineers
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Prerequisites](#prerequisites)
+- [Migration Strategy](#migration-strategy)
+- [Step-by-Step Migration](#step-by-step-migration)
+- [Rollback Procedure](#rollback-procedure)
+- [Validation](#validation)
+- [Troubleshooting](#troubleshooting)
+
+---
+
+## Overview
+
+This guide provides instructions for migrating existing Kagenti agent deployments from **sidecar mode** to **waypoint mode**.
+
+### What Changes
+
+| Aspect | Sidecar Mode (Before) | Waypoint Mode (After) |
+|--------|----------------------|------------------------|
+| **Pod Topology** | 3+ containers (agent + envoy + spiffe-helper + client-registration) | 1 container (agent only) |
+| **L7 Proxy** | Per-pod envoy sidecar | Shared waypoint gateway (1 per namespace) |
+| **Client Registration** | In-pod sidecar OR operator-managed | Operator-managed (default) |
+| **Istio Integration** | Sidecar injection | Ambient mesh |
+| **Client Credentials** | Mounted from sidecar-created secret OR operator secret | Operator-managed secret |
+| **Namespace Config** | Istio sidecar injection label | Istio ambient mesh labels |
+
+### Benefits of Migration
+
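+- **Resource Efficiency**: ~66% fewer containers per namespace (e.g. 3 agents: 12 containers in sidecar mode vs. 4 in waypoint mode, including the shared waypoint gateway)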
+- **Simplified Operations**: No sidecar lifecycle management
+- **Centralized Auth**: Operator manages client credentials
+- **Faster Deployments**: Single-container pods start faster
+- **Security**: Admin credentials isolated to operator namespace
+
+---
+
+## Prerequisites
+
+### Cluster Requirements
+
+1. **Istio Ambient Mesh Installed**:
+ ```bash
+ # Verify Istio ambient components
+ kubectl get deployment -n istio-system istiod
+ kubectl get daemonset -n istio-system ztunnel
+ ```
+
+2. **Kagenti Operator Updated**:
+ - Minimum version: Includes NamespaceWaypointReconciler and operator-managed client registration
+ - Verify operator image includes waypoint support:
+ ```bash
+ kubectl get deployment -n kagenti-system kagenti-controller-manager \
+ -o jsonpath='{.spec.template.spec.containers[0].image}'
+ ```
+
+3. **Operator Configuration**:
+ ```yaml
+ # ConfigMap: kagenti-operator-config in kagenti-system
+ apiVersion: v1
+ kind: ConfigMap
+ metadata:
+ name: kagenti-operator-config
+ namespace: kagenti-system
+ data:
+ KEYCLOAK_URL: https://keycloak.example.com
+ KEYCLOAK_REALM: kagenti
+ CLIENT_AUTH_TYPE: client-secret
+ KEYCLOAK_TOKEN_EXCHANGE_ENABLED: "true"
+ KEYCLOAK_AUDIENCE_SCOPE_ENABLED: "true"
+ ```
+
+4. **Keycloak Admin Secret** (in operator namespace):
+ ```bash
+ kubectl get secret -n kagenti-system keycloak-admin-secret
+ ```
+
+### Agent Requirements
+
+1. **Agent Code Compatibility**: Agents must support reading client credentials from mounted files:
+ - `/shared/client-id.txt`
+ - `/shared/client-secret.txt`
+
+2. **Service Mesh Compatibility**: Agents must work with L4 mTLS (ztunnel) and L7 proxy (waypoint)
+
+---
+
+## Migration Strategy
+
+### Recommended Approach: Phased, Namespace-by-Namespace Migration
+
+Migrate one namespace at a time to minimize risk:
+
+1. **Phase 1**: Test in staging namespace
+2. **Phase 2**: Migrate non-critical production namespaces
+3. **Phase 3**: Migrate critical production namespaces
+4. **Phase 4**: Decommission sidecar infrastructure
+
+### Timeline
+
+| Phase | Duration | Rollback Risk |
+|-------|----------|---------------|
+| Preparation | 1 hour | N/A |
+| Staging Test | 1-2 days | Low |
+| Non-Critical Production | 1 week | Low |
+| Critical Production | 2 weeks | Medium |
+| Cleanup | 1 week | Low |
+
+---
+
+## Step-by-Step Migration
+
+### Phase 1: Preparation
+
+**1.1 Enable Operator Features**
+
+Update operator deployment to enable waypoint provisioning and client registration:
+
+```bash
+kubectl patch deployment -n kagenti-system kagenti-controller-manager --type=json -p='[
+ {
+ "op": "add",
+ "path": "/spec/template/spec/containers/0/args/-",
+ "value": "--enable-waypoint-provisioning=true"
+ },
+ {
+ "op": "add",
+ "path": "/spec/template/spec/containers/0/args/-",
+ "value": "--enable-operator-client-registration=true"
+ }
+]'
+```
+
+**1.2 Verify Operator Configuration**
+
+```bash
+# Check operator flags
+kubectl get deployment -n kagenti-system kagenti-controller-manager \
+ -o jsonpath='{.spec.template.spec.containers[0].args}' | jq -r '.[]' | grep enable
+
+# Expected output:
+# --enable-waypoint-provisioning=true
+# --enable-operator-client-registration=true
+```
+
+**1.3 Create Staging Namespace**
+
+```bash
+kubectl apply -f - < backup-${TARGET_NAMESPACE}-namespace.yaml
+
+# Backup deployments
+kubectl get deployments -n $TARGET_NAMESPACE -o yaml > backup-${TARGET_NAMESPACE}-deployments.yaml
+
+# Backup secrets (if manually managed)
+kubectl get secrets -n $TARGET_NAMESPACE -o yaml > backup-${TARGET_NAMESPACE}-secrets.yaml
+```
+
+**3.3 Remove Sidecar Injection Label**
+
+```bash
+# Remove Istio sidecar injection label
+kubectl label namespace $TARGET_NAMESPACE istio-injection-
+
+# Add Kagenti agent type label (triggers waypoint provisioning)
+kubectl label namespace $TARGET_NAMESPACE kagenti.io/type=agent
+```
+
+**3.4 Update Agent Deployments**
+
+For each deployment in the namespace:
+
+```bash
+# Remove the sidecar-specific label (JSON Patch "remove" fails if the label is absent; skip workloads that do not have it)
+kubectl patch deployment my-agent -n $TARGET_NAMESPACE --type=json -p='[
+ {
+ "op": "remove",
+ "path": "/spec/template/metadata/labels/kagenti.io~1client-registration-inject"
+ }
+]'
+
+# Add waypoint mode label (optional, for documentation)
+kubectl patch deployment my-agent -n $TARGET_NAMESPACE --type=json -p='[
+ {
+ "op": "add",
+ "path": "/spec/template/metadata/labels/kagenti.io~1auth-mode",
+ "value": "waypoint"
+ }
+]'
+```
+
+**3.5 Trigger Rolling Update**
+
+```bash
+# Force pod restart to remove sidecars
+kubectl rollout restart deployment -n $TARGET_NAMESPACE
+
+# Wait for rollout to complete
+kubectl rollout status deployment -n $TARGET_NAMESPACE --timeout=5m
+```
+
+**3.6 Verify Migration**
+
+```bash
+# Verify waypoint gateway created
+kubectl get gateway -n $TARGET_NAMESPACE
+
+# Verify Istio ambient labels
+kubectl get namespace $TARGET_NAMESPACE -o jsonpath='{.metadata.labels}' | jq '. | with_entries(select(.key | startswith("istio")))'
+
+# Verify running pods have a single container (injected sidecars appear on Pods, not in the Deployment template)
+for pod in $(kubectl get pods -n $TARGET_NAMESPACE -o name); do
+ echo "Checking $pod..."
+ kubectl get $pod -n $TARGET_NAMESPACE \
+ -o jsonpath='{.spec.containers[*].name}' && echo ""
+done
+
+# Verify client secrets created
+kubectl get secrets -n $TARGET_NAMESPACE | grep kagenti-keycloak-client-credentials
+```
+
+**3.7 Validate Agent Communication**
+
+```bash
+# Test intra-namespace communication
+POD_A=$(kubectl get pod -n $TARGET_NAMESPACE -l app=agent-a -o jsonpath='{.items[0].metadata.name}')
+kubectl exec -n $TARGET_NAMESPACE $POD_A -- \
+ curl -s http://agent-b.${TARGET_NAMESPACE}.svc.cluster.local:8080/health
+
+# Test cross-namespace communication
+kubectl exec -n $TARGET_NAMESPACE $POD_A -- \
+ curl -s http://agent-c.other-namespace.svc.cluster.local:8080/health
+```
+
+### Phase 4: Cleanup
+
+**4.1 Remove Legacy Secrets** (if applicable)
+
+```bash
+# List legacy client-registration secrets
+kubectl get secrets -n $TARGET_NAMESPACE | grep -E '(client-registration|sidecar)'
+
+# Delete if no longer needed
+kubectl delete secret legacy-client-registration-secret -n $TARGET_NAMESPACE
+```
+
+**4.2 Remove Per-Namespace Keycloak Admin Secrets** (IMPORTANT)
+
+```bash
+# SECURITY: Remove admin secrets from agent namespaces
+# (Only operator namespace should have keycloak-admin-secret)
+
+kubectl delete secret keycloak-admin-secret -n $TARGET_NAMESPACE --ignore-not-found
+
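+# Verify only operator namespace has admin secret
+# (kubectl cannot fetch a resource by name across all namespaces, so filter with a field selector)
+kubectl get secrets -A --field-selector metadata.name=keycloak-admin-secret
+# Should only list the secret in the kagenti-system namespace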
+```
+
+**4.3 Update Monitoring/Alerts**
+
+- Update Prometheus queries to use waypoint gateway metrics
+- Update service mesh dashboards to show ambient mesh metrics
+- Remove sidecar-specific alerts (e.g., envoy_proxy_down)
+
+---
+
+## Rollback Procedure
+
+If issues occur during migration, follow these steps to rollback:
+
+### Quick Rollback (Restore Sidecar Mode)
+
+**1. Re-enable Sidecar Injection**
+
+```bash
+# Remove waypoint labels
+kubectl label namespace $TARGET_NAMESPACE kagenti.io/type-
+
+# Re-enable Istio sidecar injection
+kubectl label namespace $TARGET_NAMESPACE istio-injection=enabled
+```
+
+**2. Update Deployments**
+
+```bash
+# Add sidecar mode label
+kubectl patch deployment my-agent -n $TARGET_NAMESPACE --type=json -p='[
+ {
+ "op": "add",
+ "path": "/spec/template/metadata/labels/kagenti.io~1client-registration-inject",
+ "value": "true"
+ }
+]'
+
+# Trigger restart
+kubectl rollout restart deployment -n $TARGET_NAMESPACE
+```
+
+**3. Restore Secrets** (if needed)
+
+```bash
+# Restore from backup
+kubectl apply -f backup-${TARGET_NAMESPACE}-secrets.yaml
+```
+
+**4. Verify Rollback**
+
+```bash
+# Verify sidecars injected
+kubectl get pods -n $TARGET_NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].name}{"\n"}{end}'
+# Should show: agent, istio-proxy, spiffe-helper, etc.
+```
+
+---
+
+## Validation
+
+### Post-Migration Checklist
+
+- [ ] Waypoint gateway running and PROGRAMMED
+- [ ] Namespace has Istio ambient labels (`istio.io/dataplane-mode: ambient`)
+- [ ] Agent pods have single container (no sidecars)
+- [ ] Client credential secrets exist and have correct data
+- [ ] Agents can obtain access tokens from Keycloak
+- [ ] Intra-namespace communication working
+- [ ] Cross-namespace communication working
+- [ ] Token exchange working (if configured)
+- [ ] No Keycloak admin secrets in agent namespaces
+- [ ] Monitoring/alerts updated
+
+### Automated Validation Script
+
+```bash
+#!/bin/bash
+set -e
+
+NAMESPACE=$1
+
+if [ -z "$NAMESPACE" ]; then
+ echo "Usage: $0 <namespace>"
+ exit 1
+fi
+
+echo "=== Validating waypoint mode migration for $NAMESPACE ==="
+
+# Check waypoint gateway
+echo "Checking waypoint gateway..."
+kubectl get gateway -n $NAMESPACE ${NAMESPACE}-waypoint \
+ -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' | grep -q "True" && \
+ echo "✅ Waypoint gateway programmed" || \
+ echo "❌ Waypoint gateway not ready"
+
+# Check Istio labels
+echo "Checking Istio ambient labels..."
+kubectl get namespace $NAMESPACE -o jsonpath='{.metadata.labels.istio\.io/dataplane-mode}' | grep -q "ambient" && \
+ echo "✅ Ambient mode enabled" || \
+ echo "❌ Ambient mode not enabled"
+
+# Check pod container count
+echo "Checking pod topology..."
+CONTAINER_COUNT=$(kubectl get pods -n $NAMESPACE -l kagenti.io/type=agent \
+ -o jsonpath='{.items[0].spec.containers[*].name}' | wc -w | tr -d ' ')
+if [ "$CONTAINER_COUNT" -eq 1 ]; then
+ echo "✅ Single-container pods (waypoint mode)"
+else
+ echo "❌ Multi-container pods (sidecar mode?)"
+fi
+
+# Check client secret
+echo "Checking client credentials..."
+kubectl get secrets -n $NAMESPACE | grep -q kagenti-keycloak-client-credentials && \
+ echo "✅ Client credential secret exists" || \
+ echo "❌ Client credential secret missing"
+
+# Check admin secret NOT in namespace
+echo "Checking security (no admin secret in agent namespace)..."
+kubectl get secret keycloak-admin-secret -n $NAMESPACE 2>&1 | grep -q "NotFound" && \
+ echo "✅ Admin secret NOT in agent namespace (secure)" || \
+ echo "❌ Admin secret found in agent namespace (SECURITY ISSUE)"
+
+echo "=== Validation complete ==="
+```
+
+---
+
+## Troubleshooting
+
+### Issue: Waypoint Gateway Not Created
+
+**Symptoms**:
+- Namespace labeled but no gateway resource
+- Operator logs show no reconciliation events
+
+**Diagnosis**:
+```bash
+# Check operator is running
+kubectl get pods -n kagenti-system
+
+# Check operator logs
+kubectl logs -n kagenti-system deployment/kagenti-controller-manager | grep waypoint
+
+# Verify operator flags
+kubectl get deployment -n kagenti-system kagenti-controller-manager \
+ -o jsonpath='{.spec.template.spec.containers[0].args}' | jq -r '.[]'
+```
+
+**Solution**:
+1. Verify `--enable-waypoint-provisioning=true` flag set
+2. Check operator RBAC has Gateway resource permissions
+3. Restart operator if configuration changed
+
+### Issue: Pods Still Have Sidecars After Migration
+
+**Symptoms**:
+- Pods show multiple containers after rolling update
+
+**Diagnosis**:
+```bash
+# Check pod spec
+kubectl get pod -n $NAMESPACE -o yaml | grep -A 20 "containers:"
+
+# Check namespace labels
+kubectl get namespace $NAMESPACE -o yaml | grep labels -A 10
+```
+
+**Possible Causes**:
+1. Istio sidecar injection still enabled (`istio-injection=enabled` label)
+2. Pod template still has `kagenti.io/client-registration-inject: "true"` label
+3. Webhook still configured for sidecar injection
+
+**Solution**:
+```bash
+# Remove sidecar injection labels
+kubectl label namespace $NAMESPACE istio-injection-
+kubectl patch deployment my-agent -n $NAMESPACE --type=json -p='[
+ {"op": "remove", "path": "/spec/template/metadata/labels/kagenti.io~1client-registration-inject"}
+]'
+
+# Force restart
+kubectl rollout restart deployment -n $NAMESPACE
+```
+
+### Issue: Agent Can't Obtain Access Token
+
+**Symptoms**:
+- Agent logs show "client credentials not found"
+- HTTP 401 from Keycloak
+
+**Diagnosis**:
+```bash
+# Check if secret exists
+kubectl get secrets -n $NAMESPACE | grep kagenti-keycloak-client-credentials
+
+# Check secret contents
+SECRET_NAME=$(kubectl get secrets -n $NAMESPACE -o name | grep kagenti-keycloak-client-credentials | head -1)
+kubectl get -n $NAMESPACE $SECRET_NAME -o jsonpath='{.data}' | jq '.'
+
+# Check if mounted in pod
+kubectl get pod -n $NAMESPACE -o yaml | grep -A 10 volumeMounts
+```
+
+**Solution**:
+1. Verify `keycloak-admin-secret` exists in kagenti-system
+2. Check operator logs for client registration errors
+3. Verify pod has `/shared` volume mount
+4. Manually trigger secret recreation:
+ ```bash
+ kubectl delete -n $NAMESPACE $SECRET_NAME
+ kubectl rollout restart deployment -n $NAMESPACE
+ ```
+
+### Issue: Cross-Namespace Communication Fails
+
+**Symptoms**:
+- 503 errors when calling agents in other namespaces
+- "upstream connect error" in waypoint logs
+
+**Diagnosis**:
+```bash
+# Check target namespace has waypoint
+kubectl get gateway -n <target-namespace>
+
+# Check target namespace Istio labels
+kubectl get namespace <target-namespace> -o jsonpath='{.metadata.labels}' | jq '.'
+
+# Check ztunnel logs
+kubectl logs -n istio-system daemonset/ztunnel | grep <target-namespace>
+```
+
+**Solution**:
+1. Ensure target namespace also migrated to waypoint mode
+2. Verify Istio ambient mesh enabled in both namespaces
+3. Check network policies not blocking cross-namespace traffic
+4. Test with token exchange for proper authorization
+
+---
+
+## Best Practices
+
+1. **Migrate During Low-Traffic Windows**: Schedule migrations during maintenance windows to minimize user impact.
+
+2. **Monitor Closely**: Watch operator logs, waypoint gateway metrics, and application logs during migration.
+
+3. **Test Thoroughly in Staging**: Validate entire workflow (token acquisition, cross-namespace calls, token exchange) in staging before production.
+
+4. **Document Namespace State**: Keep records of which namespaces are sidecar vs waypoint mode during transition period.
+
+5. **Coordinate with Security Team**: Verify admin secret removal from agent namespaces aligns with security policies.
+
+6. **Update Runbooks**: Update incident response procedures to reflect waypoint mode architecture.
+
+---
+
+## Additional Resources
+
+- [Waypoint Mode User Guide](./waypoint-mode.md)
+- [Operator-Managed Client Registration](./operator-managed-client-registration.md)
+- [Architecture Documentation](./architecture.md)
+- [Istio Ambient Mesh Migration](https://istio.io/latest/docs/ambient/migrate-from-sidecar/)
diff --git a/kagenti-operator/docs/operator-managed-client-registration.md b/kagenti-operator/docs/operator-managed-client-registration.md
index 07c5500e..4a2840e5 100644
--- a/kagenti-operator/docs/operator-managed-client-registration.md
+++ b/kagenti-operator/docs/operator-managed-client-registration.md
@@ -32,7 +32,7 @@ The webhook continues to inject **proxy-init**, **envoy** / **authbridge**, and
### 1.3 Benefits
- **Fewer containers** when the sidecar path is not desired.
-- **Centralized registration** using namespace `keycloak-admin-secret` (already provisioned for the sidecar contract).
+- **Centralized registration** using operator namespace `keycloak-admin-secret` (stored securely in kagenti-system, not agent namespaces).
- **Deterministic secret naming** derived from namespace and workload name (`kagenti-keycloak-client-credentials-<hash>`), with **owner references** to the Deployment or StatefulSet.
- **Safe ordering**: the operator creates the Secret **before** setting the pod-template annotation, so new Pods do not reference a missing Secret.
- **Admission reinvocation**: the webhook uses `reinvocationPolicy: IfNeeded` so a second pass can add Secret volume mounts if the operator annotates the template **after** the first injection.
@@ -73,8 +73,8 @@ Other workloads are ignored by this controller.
### 2.4 Operator reconcile flow (simplified)
1. Read **cluster feature gates** (`kagenti-webhook` ConfigMap in the cluster defaults namespace). If `globalEnabled` or `clientRegistration` is false, skip.
-2. Read **`authbridge-config`** in the workload namespace (`KEYCLOAK_URL`, `KEYCLOAK_REALM`, `SPIRE_ENABLED`, etc.).
-3. Read **`keycloak-admin-secret`** (admin username/password).
+2. Read **`kagenti-operator-config`** from the operator namespace (kagenti-system) or fall back to **`authbridge-config`** in the workload namespace (`KEYCLOAK_URL`, `KEYCLOAK_REALM`, `SPIRE_ENABLED`, etc.).
+3. Read **`keycloak-admin-secret`** from the **operator namespace (kagenti-system)** - admin username/password for Keycloak API access.
4. Compute **Keycloak client ID**:
- If `SPIRE_ENABLED` is not true: `namespace/workloadName`.
- If SPIRE is enabled: `spiffe://<trust-domain>/ns/<namespace>/sa/<service-account>` (requires a **non-default** `serviceAccountName` and operator **`--spire-trust-domain`**).
@@ -97,8 +97,10 @@ Other workloads are ignored by this controller.
### 3.1 Platform / namespace
-- **`authbridge-config`** ConfigMap in the workload namespace with at least `KEYCLOAK_URL`, `KEYCLOAK_REALM`, and consistent `SPIRE_ENABLED` with the mesh.
-- **`keycloak-admin-secret`** in the same namespace with `KEYCLOAK_ADMIN_USERNAME` and `KEYCLOAK_ADMIN_PASSWORD`.
+- **`kagenti-operator-config`** ConfigMap in the **operator namespace (kagenti-system)** with at least `KEYCLOAK_URL`, `KEYCLOAK_REALM`, and `SPIRE_ENABLED` (waypoint mode, centralized config).
+ - Fallback: **`authbridge-config`** ConfigMap in the workload namespace (sidecar mode, backward compatibility).
+- **`keycloak-admin-secret`** in the **operator namespace (kagenti-system)** with `KEYCLOAK_ADMIN_USERNAME` and `KEYCLOAK_ADMIN_PASSWORD`.
+ - **Security**: This secret should ONLY exist in the operator namespace. Agent namespaces do not need access to Keycloak admin credentials.
- **Webhook** and **operator** versions that both implement this contract (deploy together).
### 3.2 Workload
@@ -109,8 +111,12 @@ Other workloads are ignored by this controller.
### 3.3 Operator configuration
-- When `authbridge-config` sets `SPIRE_ENABLED=true`, configure **`--spire-trust-domain`** to match the SPIRE server trust domain (same value as used for workload SPIFFE IDs).
-- Ensure the operator can read **`authbridge-config`** and **`keycloak-admin-secret`** in agent namespaces, and create/update **`kagenti-keycloak-client-credentials-*`** Secrets there (see RBAC below).
+- When `kagenti-operator-config` (or fallback `authbridge-config`) sets `SPIRE_ENABLED=true`, configure **`--spire-trust-domain`** to match the SPIRE server trust domain (same value as used for workload SPIFFE IDs).
+- Ensure the operator can:
+ - Read **`kagenti-operator-config`** and **`keycloak-admin-secret`** from the operator namespace (kagenti-system)
+ - Read **`authbridge-config`** from agent namespaces (fallback for backward compatibility)
+ - Create/update **`kagenti-keycloak-client-credentials-*`** Secrets in agent namespaces
+ - See RBAC section below for details
### 3.4 RBAC: why Secret rules are cluster-wide
@@ -122,7 +128,7 @@ That shape is intentional for this controller:
2. **Unknown agent namespaces at install time** — **ClientRegistration** reconciles **Deployments** and **StatefulSets** in **any** namespace where they match the label predicate. Platform teams add agent workloads and namespaces over time; the operator is not tied to a fixed list of namespaces configured when the ClusterRole is applied.
-3. **Data plane placement** — **`authbridge-config`** and **`keycloak-admin-secret`** live in the **workload namespace** (same contract as the webhook-injected sidecar). The controller must **Get** those Secrets (and **Create**/**Patch**/**Update** the derived credentials Secret) in that namespace on every reconcile. Without cluster-wide Secret permissions, every new agent namespace would require a coordinated RBAC update before reconciliation could succeed.
+3. **Split configuration placement** — **`kagenti-operator-config`** and **`keycloak-admin-secret`** live in the **operator namespace (kagenti-system)** for centralized waypoint mode. **`authbridge-config`** may exist in workload namespaces for backward compatibility. The controller must **Get** the admin secret from the operator namespace and **Create**/**Patch**/**Update** the derived client credentials Secret in agent namespaces. Without cluster-wide Secret permissions for creating client credentials, every new agent namespace would require a coordinated RBAC update before reconciliation could succeed.
4. **`list` / `watch`** — The kubebuilder marker generates **list** and **watch** alongside **get** for Secrets, consistent with other reconcilers in this project and with controller-runtime’s usual expectation that the delegating client can sync or fall back to the API without ad-hoc verb subsets per resource.
@@ -149,11 +155,15 @@ Rolling webhook before operator can leave default workloads **without** registra
### 4.2 Operator-managed registration (default)
-1. Ensure the namespace has `authbridge-config` and `keycloak-admin-secret`.
+1. Ensure the **operator namespace (kagenti-system)** has:
+ - `kagenti-operator-config` ConfigMap with `KEYCLOAK_URL`, `KEYCLOAK_REALM`, etc.
+ - `keycloak-admin-secret` Secret with `KEYCLOAK_ADMIN_USERNAME` and `KEYCLOAK_ADMIN_PASSWORD`
2. Use normal agent/tool labels; **omit** `kagenti.io/client-registration-inject: "true"` unless you need the legacy sidecar.
3. If SPIRE is on, set a **dedicated** `serviceAccountName`.
4. **Restart** or roll the workload so the operator reconciles and the webhook applies Secret mounts (including on reinvocation).
+**Note**: Agent namespaces do NOT need `keycloak-admin-secret`. The operator reads this secret from its own namespace (kagenti-system) for all client registrations.
+
The operator will create or reuse the Keycloak client and Secret; the webhook will inject mounts on create or on reinvocation.
### 4.3 Rollback to legacy sidecar registration
diff --git a/kagenti-operator/docs/waypoint-mode.md b/kagenti-operator/docs/waypoint-mode.md
new file mode 100644
index 00000000..150d122d
--- /dev/null
+++ b/kagenti-operator/docs/waypoint-mode.md
@@ -0,0 +1,974 @@
+# Waypoint Mode - Design and User Guide
+
+**Version**: 1.0
+**Status**: Production-Ready
+**Last Updated**: 2026-04-03
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Architecture](#architecture)
+- [Design Principles](#design-principles)
+- [Key Components](#key-components)
+- [User Guide](#user-guide)
+- [Configuration](#configuration)
+- [Security Model](#security-model)
+- [Performance Characteristics](#performance-characteristics)
+- [Troubleshooting](#troubleshooting)
+- [FAQ](#faq)
+
+---
+
+## Overview
+
+**Waypoint Mode** is a deployment pattern for Kagenti agents that eliminates per-pod sidecars by centralizing L7 proxy and authentication logic in shared Istio waypoint gateways. This mode is the **default** for new agent deployments.
+
+### What is Waypoint Mode?
+
+In waypoint mode:
+- **Agents deploy as single containers** (no sidecars)
+- **L7 proxy shared per namespace** via Istio waypoint gateways
+- **L4 mTLS handled by ztunnel** (Istio ambient mesh component)
+- **Client credentials managed centrally** by the operator
+- **Automatic infrastructure provisioning** (gateways, Istio config)
+
+### Benefits
+
+| Benefit | Description |
+|---------|-------------|
+| **Resource Efficiency** | 66% reduction in containers per pod vs sidecar mode |
+| **Simplified Pod Topology** | Single container per agent pod |
+| **Centralized Auth** | OAuth client credentials managed by operator |
+| **Automatic Provisioning** | Zero manual configuration for waypoint gateways |
+| **Security Isolation** | Admin credentials never exposed to agent namespaces |
+
+### Comparison: Waypoint vs Sidecar Mode
+
+| Aspect | Waypoint Mode | Sidecar Mode (Legacy) |
+|--------|---------------|------------------------|
+| Containers per pod | **1** (agent only) | 3+ (agent + envoy + spiffe-helper, plus client-registration when the in-pod registration sidecar is used) |
+| L7 Proxy | Shared waypoint gateway (1 per namespace) | Per-pod envoy sidecar |
+| L4 mTLS | ztunnel DaemonSet (Istio ambient) | envoy sidecar |
+| Client Registration | Operator-managed (centralized) | In-pod sidecar or operator-managed |
+| Istio Integration | Ambient mesh | Sidecar injection |
+| Resource Overhead | Low (shared gateway) | High (per-pod sidecars) |
+| Pod Startup Time | Fast (single container) | Slower (init containers + sidecars) |
+
+---
+
+## Architecture
+
+### High-Level Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│ Kagenti Platform Operator (kagenti-system namespace)            │
+│                                                                 │
+│  ┌───────────────────────────────────────────────────────────┐  │
+│  │ NamespaceWaypointReconciler                               │  │
+│  │ - Watches Namespaces with kagenti.io/type=agent           │  │
+│  │ - Provisions Istio Gateway resources                      │  │
+│  │ - Applies Istio ambient mesh labels                       │  │
+│  └───────────────────────────────────────────────────────────┘  │
+│                                                                 │
+│  ┌───────────────────────────────────────────────────────────┐  │
+│  │ ClientRegistrationReconciler                              │  │
+│  │ - Watches agent Deployments/StatefulSets                  │  │
+│  │ - Registers OIDC clients in Keycloak                      │  │
+│  │ - Creates client credential secrets                       │  │
+│  └───────────────────────────────────────────────────────────┘  │
+└─────────────────────────────────────────────────────────────────┘
+                                 │
+                                 │ Provisions & Manages
+                                 │
+                ┌────────────────┴────────────────┐
+                │                                 │
+                ▼                                 ▼
+┌───────────────────────────────┐ ┌───────────────────────────────┐
+│ Agent Namespace (e.g., team1) │ │ Agent Namespace (e.g., team2) │
+│                               │ │                               │
+│ ┌───────────────────────────┐ │ │ ┌───────────────────────────┐ │
+│ │ Istio Waypoint Gateway    │ │ │ │ Istio Waypoint Gateway    │ │
+│ │ - L7 Envoy proxy          │ │ │ │ - L7 Envoy proxy          │ │
+│ │ - JWT validation          │ │ │ │ - JWT validation          │ │
+│ │ - Token exchange          │ │ │ │ - Token exchange          │ │
+│ │ - mTLS workload cert      │ │ │ │ - mTLS workload cert      │ │
+│ └───────────────────────────┘ │ │ └───────────────────────────┘ │
+│               │               │ │               │               │
+│               ▼               │ │               ▼               │
+│ ┌───────────────────────────┐ │ │ ┌───────────────────────────┐ │
+│ │ Agent Pod (1 container)   │ │ │ │ Agent Pod (1 container)   │ │
+│ │ - Agent application       │ │ │ │ - Agent application       │ │
+│ │ - OAuth client creds      │ │ │ │ - OAuth client creds      │ │
+│ │   (mounted from Secret)   │ │ │ │   (mounted from Secret)   │ │
+│ └───────────────────────────┘ │ │ └───────────────────────────┘ │
+│                               │ │                               │
+│ ┌───────────────────────────┐ │ │ ┌───────────────────────────┐ │
+│ │ Client Credentials Secret │ │ │ │ Client Credentials Secret │ │
+│ │ - client-id.txt           │ │ │ │ - client-id.txt           │ │
+│ │ - client-secret.txt       │ │ │ │ - client-secret.txt       │ │
+│ └───────────────────────────┘ │ │ └───────────────────────────┘ │
+│                               │ │                               │
+│ Namespace Labels:             │ │ Namespace Labels:             │
+│   istio-discovery: enabled    │ │   istio-discovery: enabled    │
+│   istio.io/dataplane-mode:    │ │   istio.io/dataplane-mode:    │
+│     ambient                   │ │     ambient                   │
+│   istio.io/use-waypoint:      │ │   istio.io/use-waypoint:      │
+│     team1-waypoint            │ │     team2-waypoint            │
+└───────────────────────────────┘ └───────────────────────────────┘
+```
+
+### Data Flow: Agent-to-Agent Communication
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 1: Agent Obtains Access Token                              │
+└─────────────────────────────────────────────────────────────────┘
+
+Agent Pod (team1/agent-a)
+  │
+  │ 1. Read client credentials from mounted Secret
+  │    - client-id: team1/agent-a
+  │    - client-secret: <secret>
+  │
+  ▼
+  │ 2. Request access token from Keycloak
+  │    POST /realms/kagenti/protocol/openid-connect/token
+  │    grant_type=client_credentials
+  │
+Keycloak
+  │
+  │ 3. Return JWT access token
+  │    - aud: [team2/agent-b, team3/agent-c, ...]
+  │    - azp: team1/agent-a
+  │    - exp: 300 (5 minutes)
+  │
+  ▼
+Agent Pod (has access token)
+
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 2: Cross-Namespace Call (team1/agent-a → team2/agent-b)    │
+└─────────────────────────────────────────────────────────────────┘
+
+Agent Pod (team1/agent-a)
+  │
+  │ 4. HTTP Request with JWT
+  │    GET http://agent-b.team2.svc.cluster.local:8080/api/task
+  │    Authorization: Bearer <access-token>
+  │
+  ▼
+ztunnel (L4 mTLS - node-local DaemonSet)
+  │
+  │ 5. L4 mTLS tunnel
+  │    Source: team1/agent-a
+  │    Dest: team1-waypoint
+  │
+  ▼
+Waypoint Gateway (team1-waypoint)
+  │
+  │ 6. L7 Processing
+  │    - Extract JWT from Authorization header
+  │    - Validate JWT signature (Keycloak JWKS)
+  │    - Check audience claim (must include team2/agent-b)
+  │    - Check expiry, issuer, etc.
+  │    - Optional: Exchange token for team2/agent-b audience
+  │
+  ▼ L4 mTLS (cross-namespace)
+  │
+Waypoint Gateway (team2-waypoint)
+  │
+  │ 7. Final L7 validation
+  │    - Re-validate JWT
+  │    - Check audience matches team2/agent-b
+  │    - Apply AuthorizationPolicy (if configured)
+  │
+  ▼ L4 mTLS (in-namespace)
+  │
+ztunnel (team2 namespace)
+  │
+  ▼
+Agent Pod (team2/agent-b)
+  │
+  │ 8. Receive authenticated request
+  │    Headers include validated identity information
+```
+
+---
+
+## Design Principles
+
+### 1. Zero-Configuration Deployment
+
+**Principle**: Agents should deploy with minimal configuration. Infrastructure provisioning should be automatic.
+
+**Implementation**:
+- NamespaceWaypointReconciler watches namespaces with `kagenti.io/type: agent` label
+- Automatically creates Istio Gateway resources
+- Automatically applies Istio ambient mesh labels
+- No manual `istioctl` commands required
+
+**Example**:
+```yaml
+# All you need is this label on the namespace
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: team1
+ labels:
+ kagenti.io/type: agent # Triggers automatic waypoint provisioning
+```
+
+### 2. Centralized Secret Management
+
+**Principle**: Admin credentials should never be exposed to agent namespaces. Client credentials should be managed by the operator.
+
+**Implementation**:
+- `keycloak-admin-secret` exists ONLY in operator namespace (kagenti-system)
+- ClientRegistrationReconciler reads admin secret from `r.OperatorNamespace`
+- Agent namespaces receive only client credentials (client-id + client-secret)
+- Secrets have owner references for automatic cleanup
+
+**Security Benefits**:
+- Reduced attack surface (admin credentials in single namespace)
+- Principle of least privilege (agents never see admin credentials)
+- Simplified secret rotation (one secret to rotate instead of N)
+
+### 3. Resource Efficiency
+
+**Principle**: Minimize per-pod overhead. Share infrastructure where possible.
+
+**Implementation**:
+- One waypoint gateway per namespace (shared by all agents)
+- No per-pod envoy sidecars
+- No per-pod spiffe-helper sidecars
+- No per-pod client-registration sidecars
+
+**Resource Savings**:
+- 66% reduction in containers per pod
+- Reduced CPU/memory footprint
+- Faster pod startup times
+
+### 4. Istio Ambient Mesh Integration
+
+**Principle**: Leverage Istio ambient mesh for L4 mTLS and waypoint gateway support.
+
+**Implementation**:
+- `istio.io/dataplane-mode: ambient` enables Istio ambient mesh
+- ztunnel DaemonSet handles L4 mTLS transparently
+- Waypoint gateways handle L7 processing
+- No sidecar injection required
+
+**Benefits**:
+- Simpler pod topology
+- Transparent L4 mTLS
+- Centralized L7 policy enforcement
+
+---
+
+## Key Components
+
+### NamespaceWaypointReconciler
+
+**Purpose**: Automatically provision waypoint gateways for namespaces with Kagenti agents.
+
+**Triggers**:
+- Namespace with `kagenti.io/type: agent` label
+- Pod created with `kagenti.io/type: agent` label in the namespace
+
+**Actions**:
+1. Check if namespace has Kagenti workload pods
+2. If yes, create Istio Gateway resource (if not exists)
+3. Apply Istio ambient mesh labels to namespace:
+ - `istio-discovery: enabled`
+ - `istio.io/dataplane-mode: ambient`
+ - `istio.io/use-waypoint: <namespace>-waypoint`
+
+**Gateway Specification**:
+```yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: team1-waypoint
+  namespace: team1
+spec:
+  gatewayClassName: istio-waypoint
+  listeners:
+  - name: mesh
+    port: 15008
+    protocol: HBONE
+```
+
+**Controller Configuration**:
+```go
+// cmd/main.go
+flag.BoolVar(&enableWaypointProvisioning, "enable-waypoint-provisioning", true,
+	"Automatically provision Istio waypoint gateways for namespaces with Kagenti workloads")
+```
+
+**Typical Reconciliation Time**: ~20 seconds from agent pod creation to waypoint ready.
+
+### ClientRegistrationReconciler
+
+**Purpose**: Register agent workloads as OAuth clients in Keycloak and create credential secrets.
+
+**Triggers**:
+- Deployment or StatefulSet with `kagenti.io/type: agent` label
+- Label `kagenti.io/client-registration-inject` is NOT set to "true" (setting it to "true" opts the workload out of operator management)
+
+**Actions**:
+1. Read Keycloak configuration from `kagenti-operator-config` ConfigMap (or fallback to namespace `authbridge-config`)
+2. Read `keycloak-admin-secret` from operator namespace (kagenti-system)
+3. Register or fetch OIDC client in Keycloak:
+ - Client ID: `namespace/workload-name` (or SPIFFE ID if SPIRE enabled)
+ - Client auth type: `client-secret`
+ - Token exchange: enabled
+ - Audience scope: platform clients + configured audiences
+4. Create/update Secret in agent namespace:
+ - Name: `kagenti-keycloak-client-credentials-<hash>`
+ - Keys: `client-id.txt`, `client-secret.txt`
+ - Owner reference: Deployment/StatefulSet (auto-deleted with workload)
+5. Annotate pod template with secret name for webhook mounting
+
+**Secret Naming**:
+```go
+// Deterministic: SHA256 hash of namespace + workload name
+func keycloakClientCredentialsSecretName(namespace, workload string) string {
+ sum := sha256.Sum256([]byte(namespace + "\x00" + workload + "\x00kagenti-keycloak-client-credentials"))
+ return "kagenti-keycloak-client-credentials-" + hex.EncodeToString(sum[:8])
+}
+```
+
+**Controller Configuration**:
+```go
+// cmd/main.go
+flag.BoolVar(&enableOperatorClientRegistration, "enable-operator-client-registration", false,
+	"Reconcile Keycloak client registration for agent/tool workloads unless "+
+		"kagenti.io/client-registration-inject=true (legacy sidecar)")
+```
+
+**Typical Reconciliation Time**: ~30 seconds from deployment creation to secret available.
+
+### Istio Waypoint Gateway
+
+**Purpose**: Shared L7 proxy for all agents in a namespace.
+
+**Responsibilities**:
+- JWT validation (signature, expiry, audience, issuer)
+- Token exchange (OAuth 2.0 RFC 8693)
+- AuthorizationPolicy enforcement
+- mTLS termination (workload certificates from Istio CA)
+- Request routing to upstream services
+
+**Pod Specification** (managed by Istio):
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+ name: team1-waypoint-f6d4d946-xxxxx
+ namespace: team1
+ labels:
+ gateway.networking.k8s.io/gateway-name: team1-waypoint
+ istio.io/gateway-name: team1-waypoint
+spec:
+ containers:
+ - name: istio-proxy
+ image: gcr.io/istio-release/proxyv2:1.24.0
+ args:
+ - proxy
+ - waypoint
+ - --domain
+ - $(POD_NAMESPACE).svc.cluster.local
+ env:
+ - name: ISTIO_META_WAYPOINT_NAME
+ value: team1-waypoint
+```
+
+**Resource Requirements** (default):
+- CPU: 100m request, 2000m limit
+- Memory: 128Mi request, 1Gi limit
+
+**Scaling**: Horizontal Pod Autoscaler can be configured for high-traffic namespaces.
+
+### ztunnel (Istio Ambient Mesh)
+
+**Purpose**: L4 mTLS data plane for Istio ambient mesh.
+
+**Deployment**: DaemonSet (one pod per node)
+
+**Responsibilities**:
+- Transparent L4 mTLS tunneling
+- Traffic capture via iptables or eBPF
+- Workload identity verification (SPIFFE SVIDs)
+- Traffic routing to waypoint gateways
+
+**Configuration**: Managed by Istio control plane (istiod).
+
+---
+
+## User Guide
+
+### Prerequisites
+
+1. **Istio Ambient Mesh Installed**:
+ ```bash
+ istioctl install --set profile=ambient --set values.pilot.env.PILOT_ENABLE_AMBIENT=true
+ ```
+
+2. **Keycloak Deployed and Configured**:
+ - Realm created (e.g., `kagenti`)
+ - Admin credentials available
+
+3. **Kagenti Operator Deployed**:
+ ```bash
+ kubectl apply -f config/crd/bases/
+ kubectl apply -f config/rbac/
+ kubectl apply -f config/manager/
+ ```
+
+4. **Operator Configuration**:
+ ```yaml
+ apiVersion: v1
+ kind: ConfigMap
+ metadata:
+ name: kagenti-operator-config
+ namespace: kagenti-system
+ data:
+ KEYCLOAK_URL: https://keycloak.example.com
+ KEYCLOAK_REALM: kagenti
+ CLIENT_AUTH_TYPE: client-secret
+ KEYCLOAK_TOKEN_EXCHANGE_ENABLED: "true"
+ KEYCLOAK_AUDIENCE_SCOPE_ENABLED: "true"
+ PLATFORM_CLIENT_IDS: kagenti
+ SPIRE_ENABLED: "false"
+ ```
+
+5. **Keycloak Admin Secret** (operator namespace only):
+   ```yaml
+   apiVersion: v1
+   kind: Secret
+   metadata:
+     name: keycloak-admin-secret
+     namespace: kagenti-system
+   type: Opaque
+   stringData:
+     KEYCLOAK_ADMIN_USERNAME: admin
+     KEYCLOAK_ADMIN_PASSWORD: <admin-password>
+   ```
+
+### Deploying an Agent in Waypoint Mode
+
+**Step 1**: Create namespace with agent label
+
+```yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: my-agents
+ labels:
+ kagenti.io/type: agent # Triggers waypoint provisioning
+```
+
+**Step 2**: Deploy your agent
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: my-agent
+  namespace: my-agents
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: my-agent
+  template:
+    metadata:
+      labels:
+        app: my-agent
+        kagenti.io/type: agent # Required for operator discovery
+        kagenti.io/auth-mode: waypoint # Optional: documents intent
+    spec:
+      containers:
+      - name: agent
+        image: my-org/my-agent:latest
+        env:
+        - name: KEYCLOAK_URL
+          value: https://keycloak.example.com
+        - name: KEYCLOAK_REALM
+          value: kagenti
+        # Client credentials will be mounted by webhook at /shared/client-*.txt
+        volumeMounts:
+        - name: shared-data
+          mountPath: /shared
+      volumes:
+      - name: shared-data
+        emptyDir: {}
+```
+
+**Step 3**: Verify deployment
+
+```bash
+# Check waypoint gateway created
+kubectl get gateway -n my-agents
+
+# Check Istio labels applied
+kubectl get namespace my-agents -o jsonpath='{.metadata.labels}' | jq '.'
+
+# Check client secret created
+kubectl get secrets -n my-agents | grep kagenti-keycloak-client-credentials
+
+# Check pod has single container
+kubectl get pod -n my-agents -l app=my-agent -o jsonpath='{.items[0].spec.containers[*].name}'
+```
+
+**Step 4**: Access client credentials in your agent
+
+```python
+# Python example
+import os
+
+import requests
+
+def get_keycloak_credentials():
+    """Read client credentials from mounted secret."""
+    client_id = open('/shared/client-id.txt').read().strip()
+    client_secret = open('/shared/client-secret.txt').read().strip()
+    return client_id, client_secret
+
+def get_access_token():
+    """Obtain JWT access token from Keycloak."""
+    client_id, client_secret = get_keycloak_credentials()
+    keycloak_url = os.getenv('KEYCLOAK_URL')
+    realm = os.getenv('KEYCLOAK_REALM')
+
+    response = requests.post(
+        f"{keycloak_url}/realms/{realm}/protocol/openid-connect/token",
+        data={
+            'grant_type': 'client_credentials',
+            'client_id': client_id,
+            'client_secret': client_secret,
+        },
+        headers={'Content-Type': 'application/x-www-form-urlencoded'}
+    )
+
+    return response.json()['access_token']
+
+# Use the token
+token = get_access_token()
+headers = {'Authorization': f'Bearer {token}'}
+response = requests.get('http://other-agent.other-ns.svc.cluster.local:8080/api/task', headers=headers)
+```
+
+### Token Exchange for Cross-Namespace Calls
+
+When calling an agent in a different namespace, exchange your token for the target audience:
+
+```python
+# Reuses get_keycloak_credentials() and get_access_token() from the example above
+import os
+
+import requests
+
+def exchange_token(access_token, target_audience):
+    """Exchange token for specific audience using OAuth 2.0 Token Exchange (RFC 8693)."""
+    client_id, client_secret = get_keycloak_credentials()
+    keycloak_url = os.getenv('KEYCLOAK_URL')
+    realm = os.getenv('KEYCLOAK_REALM')
+
+    response = requests.post(
+        f"{keycloak_url}/realms/{realm}/protocol/openid-connect/token",
+        data={
+            'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange',
+            'client_id': client_id,
+            'client_secret': client_secret,
+            'subject_token': access_token,
+            'subject_token_type': 'urn:ietf:params:oauth:token-type:access_token',
+            'audience': target_audience,
+        },
+        headers={'Content-Type': 'application/x-www-form-urlencoded'}
+    )
+
+    return response.json()['access_token']
+
+# Example: Call agent in different namespace
+access_token = get_access_token()
+target_audience = 'other-namespace/other-agent'
+exchanged_token = exchange_token(access_token, target_audience)
+
+headers = {'Authorization': f'Bearer {exchanged_token}'}
+response = requests.get('http://other-agent.other-namespace.svc.cluster.local:8080/api/task', headers=headers)
+```
+
+---
+
+## Configuration
+
+### Operator Flags
+
+```bash
+# cmd/main.go
+--enable-waypoint-provisioning=true # Enable automatic waypoint provisioning (default: true)
+--enable-operator-client-registration=true # Enable operator-managed client registration (default: false)
+--operator-namespace=kagenti-system # Operator namespace for reading admin secrets
+--spire-trust-domain=cluster.local # SPIRE trust domain (if SPIRE enabled)
+```
+
+### Opt-Out of Waypoint Mode
+
+To use legacy sidecar mode for specific workloads:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: legacy-agent
+ namespace: my-agents
+spec:
+ template:
+ metadata:
+ labels:
+ kagenti.io/type: agent
+ kagenti.io/client-registration-inject: "true" # Opt into sidecar mode
+```
+
+This will:
+- Disable operator-managed client registration
+- Enable webhook injection of client-registration sidecar
+- Use per-pod envoy sidecar instead of waypoint gateway
+
+### Namespace-Specific Configuration
+
+Provide Keycloak config per namespace (used only as a fallback when the operator config is missing or incomplete):
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: authbridge-config
+ namespace: my-agents
+data:
+ KEYCLOAK_URL: https://keycloak.example.com
+ KEYCLOAK_REALM: my-custom-realm
+ CLIENT_AUTH_TYPE: client-secret
+ KEYCLOAK_TOKEN_EXCHANGE_ENABLED: "true"
+```
+
+Operator will prefer `kagenti-operator-config` from operator namespace, but fall back to namespace-local `authbridge-config` if operator config is incomplete.
+
+---
+
+## Security Model
+
+### Admin Credential Isolation
+
+**Threat Model**: Compromised agent namespace should NOT expose Keycloak admin credentials.
+
+**Implementation**:
+- `keycloak-admin-secret` exists ONLY in operator namespace (kagenti-system)
+- Agent namespaces NEVER contain admin credentials
+- ClientRegistrationReconciler reads from `r.OperatorNamespace`
+
+**Verification**:
+```bash
+# Admin secret should exist only here
+kubectl get secret -n kagenti-system keycloak-admin-secret
+
+# Should fail (NotFound)
+kubectl get secret -n my-agents keycloak-admin-secret
+```
+
+### Client Credential Lifecycle
+
+**Creation**:
+- Operator creates secret with owner reference to Deployment/StatefulSet
+- Secret automatically deleted when workload is deleted
+
+**Rotation**:
+- Manual: Delete secret, operator will recreate with new credentials
+- Automatic: Future enhancement (rotate on schedule)
+
+**Access Control**:
+- Secret mounted read-only into agent pods
+- RBAC: Only pods in the namespace can read the secret
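+- Agent workloads need no cluster-wide Secret access (the operator itself relies on cluster-wide Secret permissions to create and update these Secrets in agent namespaces)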
+
+### JWT Validation at Waypoint
+
+Waypoint gateways validate JWTs before routing:
+
+1. **Signature Verification**: RSA signature verified against Keycloak JWKS
+2. **Expiry Check**: `exp` claim must be in the future
+3. **Issuer Validation**: `iss` claim must match trusted Keycloak issuer
+4. **Audience Validation**: `aud` claim must include target service
+5. **Not-Before Check**: `nbf` claim (if present) must be in the past
+
+**Istio RequestAuthentication**:
+```yaml
+apiVersion: security.istio.io/v1
+kind: RequestAuthentication
+metadata:
+ name: jwt-validation
+ namespace: my-agents
+spec:
+ jwtRules:
+ - issuer: https://keycloak.example.com/realms/kagenti
+ jwksUri: https://keycloak.example.com/realms/kagenti/protocol/openid-connect/certs
+ audiences:
+ - my-agents/my-agent
+```
+
+**Istio AuthorizationPolicy**:
+```yaml
+apiVersion: security.istio.io/v1
+kind: AuthorizationPolicy
+metadata:
+  name: require-jwt
+  namespace: my-agents
+spec:
+  action: ALLOW
+  rules:
+  - from:
+    - source:
+        requestPrincipals: ["*"]
+    when:
+    - key: request.auth.claims[aud]
+      values: ["my-agents/my-agent"]
+```
+
+---
+
+## Performance Characteristics
+
+### Resource Overhead
+
+**Waypoint Mode** (per namespace with 10 agents):
+- Waypoint gateway pod: 1
+- Agent pods: 10 (1 container each)
+- Total containers: 11
+
+**Sidecar Mode** (per namespace with 10 agents):
+- Agent pods: 10 (3 containers each: agent + envoy + spiffe-helper)
+- Total containers: 30
+
+**Savings**: 63% reduction in container count.
+
+### Latency Impact
+
+**L4 mTLS (ztunnel)**:
+- Overhead: ~0.5ms (transparent TCP tunnel)
+- CPU: Minimal (eBPF-based traffic capture)
+
+**L7 Proxy (waypoint)**:
+- JWT validation: ~1-2ms (cached JWKS)
+- Token exchange: ~100ms (roundtrip to Keycloak)
+- Total L7 overhead: ~2-5ms (without token exchange)
+
+**Recommendation**: Cache access tokens and reuse until expiry (5 minutes default).
+
+### Scalability
+
+**Waypoint Gateway Scaling**:
+- Default: 1 replica per namespace
+- High-traffic namespaces: Use HorizontalPodAutoscaler
+ ```yaml
+ apiVersion: autoscaling/v2
+ kind: HorizontalPodAutoscaler
+ metadata:
+ name: team1-waypoint-hpa
+ namespace: team1
+ spec:
+ scaleTargetRef:
+ apiVersion: apps/v1
+ kind: Deployment
+ name: team1-waypoint
+ minReplicas: 2
+ maxReplicas: 10
+ metrics:
+ - type: Resource
+ resource:
+ name: cpu
+ target:
+ type: Utilization
+ averageUtilization: 70
+ ```
+
+**ztunnel Scaling**:
+- DaemonSet: Scales with node count automatically
+- No manual intervention required
+
+---
+
+## Troubleshooting
+
+### Waypoint Gateway Not Created
+
+**Symptom**: Namespace has agents but no waypoint gateway.
+
+**Diagnosis**:
+```bash
+# Check namespace labels
+kubectl get namespace my-agents -o yaml | grep labels -A 10
+
+# Check operator logs
+kubectl logs -n kagenti-system deployment/kagenti-controller-manager | grep waypoint
+
+# Check if waypoint provisioning enabled
+kubectl get deployment -n kagenti-system kagenti-controller-manager -o yaml | grep enable-waypoint
+```
+
+**Solution**:
+1. Ensure namespace has `kagenti.io/type: agent` label
+2. Ensure the operator was not started with `--enable-waypoint-provisioning=false` (provisioning is enabled by default)
+3. Check operator RBAC permissions for Gateway resources
+
+### Client Secret Not Created
+
+**Symptom**: Agent deployed but no `kagenti-keycloak-client-credentials-*` secret.
+
+**Diagnosis**:
+```bash
+# Check operator logs
+kubectl logs -n kagenti-system deployment/kagenti-controller-manager | grep clientregistration
+
+# Check if keycloak-admin-secret exists
+kubectl get secret -n kagenti-system keycloak-admin-secret
+
+# Check Keycloak config
+kubectl get configmap -n kagenti-system kagenti-operator-config -o yaml
+```
+
+**Common Causes**:
+1. Missing `keycloak-admin-secret` in kagenti-system
+2. Incorrect `KEYCLOAK_URL` or `KEYCLOAK_REALM` in config
+3. Keycloak admin credentials invalid (401 errors in logs)
+4. Workload has `kagenti.io/client-registration-inject: "true"` (opt-out of operator management)
+
+**Solution**:
+```bash
+# Create admin secret if missing
+kubectl create secret generic keycloak-admin-secret \
+ -n kagenti-system \
+ --from-literal=KEYCLOAK_ADMIN_USERNAME=admin \
+ --from-literal=KEYCLOAK_ADMIN_PASSWORD=<admin-password>
+
+# Verify Keycloak connectivity
+curl -X POST "https://keycloak.example.com/realms/kagenti/protocol/openid-connect/token" \
+ -d "grant_type=password" \
+ -d "client_id=admin-cli" \
+ -d "username=admin" \
+ -d "password=<admin-password>"
+```
+
+### Token Exchange Fails
+
+**Symptom**: `400 Bad Request: Requested audience not available: target-namespace/target-agent`
+
+**Diagnosis**:
+```bash
+# Decode JWT to see available audiences
+TOKEN=$(cat /tmp/my_token.txt)
+PAYLOAD=$(echo "$TOKEN" | cut -d'.' -f2)
+echo "$PAYLOAD==" | base64 -d | jq '.aud'
+```
+
+**Root Cause**: Target audience not configured in Keycloak client scopes.
+
+**Solution**:
+1. **Option A**: Use existing audience (check token's `aud` claim)
+2. **Option B**: Configure Keycloak audience scopes:
+ - Navigate to Keycloak Admin Console
+ - Realms → kagenti → Client scopes
+ - Create audience scope for target agent
+ - Assign to source agent client
+
+**Future Enhancement**: Operator will automatically configure bidirectional audience scopes.
+
+### 503 Errors from Waypoint
+
+**Symptom**: `upstream connect error or disconnect/reset before headers`
+
+**Diagnosis**:
+```bash
+# Check waypoint logs
+kubectl logs -n my-agents -l gateway.networking.k8s.io/gateway-name=my-agents-waypoint
+
+# Check target pod is running
+kubectl get pods -n target-namespace -l app=target-agent
+
+# Check if target pod has listener on expected port
+kubectl exec -n target-namespace deployment/target-agent -- netstat -tuln
+```
+
+**Common Causes**:
+1. Target pod not running
+2. Target pod has no HTTP server on expected port
+3. Service port mismatch (Service port ≠ container port)
+
+**Solution**:
+1. Ensure target pod has HTTP server listening
+2. Verify Service port matches container port
+3. Check Istio VirtualService / DestinationRule configuration (if custom routing)
+
+---
+
+## FAQ
+
+### Q: Is waypoint mode the default?
+
+**A**: Yes, for new deployments. Waypoint mode is the default when:
+- Operator has `--enable-waypoint-provisioning=true`
+- Operator has `--enable-operator-client-registration=true`
+- Workload does NOT have `kagenti.io/client-registration-inject: "true"` label
+
+Legacy sidecar mode is opt-in via the `kagenti.io/client-registration-inject: "true"` label.
+
+### Q: Can I mix waypoint and sidecar mode in the same cluster?
+
+**A**: Yes. Waypoint mode and sidecar mode can coexist:
+- Waypoint mode: Namespaces with Istio ambient mesh + waypoint gateways
+- Sidecar mode: Namespaces with Istio sidecar injection
+
+Just ensure the appropriate Istio configuration is applied per namespace.
+
+### Q: How do I migrate from sidecar to waypoint mode?
+
+**A**: See [Migration Guide](./migration-sidecar-to-waypoint.md) for step-by-step instructions.
+
+### Q: Does waypoint mode support SPIFFE/SPIRE?
+
+**A**: Yes. When `SPIRE_ENABLED=true` in configuration:
+- Client IDs use SPIFFE format: `spiffe://<trust-domain>/ns/<namespace>/sa/<service-account>`
+- Requires `--spire-trust-domain` flag on operator
+- Requires dedicated ServiceAccount (not `default`)
+
+### Q: What happens if waypoint gateway pod crashes?
+
+**A**: Kubernetes will automatically restart the pod. During downtime:
+- L4 mTLS still works (ztunnel)
+- L7 requests will fail until waypoint recovers
+- Consider deploying multiple waypoint replicas for HA
+
+### Q: How do I monitor waypoint gateways?
+
+**A**: Waypoint gateways expose Prometheus metrics:
+```bash
+kubectl port-forward -n my-agents deployment/my-agents-waypoint 15020:15020
+curl http://localhost:15020/stats/prometheus
+```
+
+Key metrics:
+- `istio_requests_total`: Request count
+- `istio_request_duration_milliseconds`: Latency
+- `istio_request_bytes`: Request size
+- `envoy_cluster_upstream_cx_connect_fail`: Connection failures
+
+### Q: Can I customize waypoint gateway resources?
+
+**A**: Currently, waypoint gateways use Istio defaults. Custom resource limits can be set via Istio configuration. Future enhancement: Allow per-namespace customization.
+
+---
+
+## Additional Resources
+
+- [Architecture Documentation](./architecture.md)
+- [Operator-Managed Client Registration](./operator-managed-client-registration.md)
+- [Migration Guide: Sidecar to Waypoint](./migration-sidecar-to-waypoint.md)
+- [Istio Ambient Mesh Documentation](https://istio.io/latest/docs/ambient/)
+- [OAuth 2.0 Token Exchange (RFC 8693)](https://datatracker.ietf.org/doc/html/rfc8693)
+- [Keycloak Documentation](https://www.keycloak.org/documentation)
diff --git a/kagenti-operator/go.mod b/kagenti-operator/go.mod
index 19c31643..8ffc6823 100644
--- a/kagenti-operator/go.mod
+++ b/kagenti-operator/go.mod
@@ -17,6 +17,7 @@ require (
k8s.io/client-go v0.32.0
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
sigs.k8s.io/controller-runtime v0.20.0
+ sigs.k8s.io/gateway-api v1.2.1
sigs.k8s.io/yaml v1.4.0
)
@@ -30,8 +31,7 @@ require (
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
- github.com/emicklei/go-restful/v3 v3.11.0 // indirect
- github.com/evanphx/json-patch v4.12.0+incompatible // indirect
+ github.com/emicklei/go-restful/v3 v3.12.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
@@ -40,7 +40,7 @@ require (
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
- github.com/go-openapi/jsonreference v0.20.2 // indirect
+ github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
diff --git a/kagenti-operator/go.sum b/kagenti-operator/go.sum
index aec43282..31212b79 100644
--- a/kagenti-operator/go.sum
+++ b/kagenti-operator/go.sum
@@ -15,15 +15,14 @@ github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyY
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
-github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
-github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84=
-github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
+github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk=
+github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
+github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI=
+github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg=
github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
@@ -41,12 +40,10 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
-github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
-github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
-github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
-github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
+github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
@@ -81,11 +78,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
-github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
@@ -255,6 +249,8 @@ sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcp
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrsfQkQs=
sigs.k8s.io/controller-runtime v0.20.0/go.mod h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU=
+sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM=
+sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA=
diff --git a/kagenti-operator/internal/controller/clientregistration_controller.go b/kagenti-operator/internal/controller/clientregistration_controller.go
index 66aa1818..ac26130c 100644
--- a/kagenti-operator/internal/controller/clientregistration_controller.go
+++ b/kagenti-operator/internal/controller/clientregistration_controller.go
@@ -38,7 +38,9 @@ import (
// Well-known namespace resources (same contract as kagenti-webhook injector).
const (
authbridgeConfigConfigMap = "authbridge-config"
- keycloakAdminSecret = "keycloak-admin-secret"
+ // operatorConfigConfigMap is the centralized config in the operator namespace (kagenti-system)
+ operatorConfigConfigMap = "kagenti-operator-config"
+ keycloakAdminSecret = "keycloak-admin-secret"
// LabelClientRegistrationInject: when not "true", the operator registers the OAuth client and sets
// AnnotationKeycloakClientSecretName. Value "true" opts the workload into the legacy webhook
@@ -55,11 +57,16 @@ const (
// never reference a missing Secret; the webhook mounts the Secret for injected sidecars that use shared-data.
type ClientRegistrationReconciler struct {
client.Client
- // APIReader reads authbridge-config and keycloak-admin-secret from the API server. Those objects
- // are not in the manager's ConfigMap cache (see cmd/main.go cache.ByObject for ConfigMap).
+ // APIReader reads kagenti-operator-config and keycloak-admin-secret from the API server.
+ // These objects are not in the manager's ConfigMap cache (see cmd/main.go cache.ByObject).
+ // The keycloak-admin-secret is read from the operator namespace only (kagenti-system).
APIReader client.Reader
Scheme *runtime.Scheme
+ // OperatorNamespace is the namespace where the operator is running (e.g., "kagenti-system").
+ // Used to read centralized kagenti-operator-config ConfigMap.
+ OperatorNamespace string
+
SpireTrustDomain string
// KeycloakAdminTokenCache caches admin password-grant tokens by Keycloak URL and credentials to
// avoid a token request on every reconcile. If nil, PasswordGrantToken is used without caching.
@@ -153,20 +160,27 @@ func (r *ClientRegistrationReconciler) reconcileOne(
ns := owner.GetNamespace()
- ab, err := readAuthbridgeConfigMap(ctx, r.uncachedReader(), ns)
+ ab, err := readAuthbridgeConfigMap(ctx, r.uncachedReader(), r.OperatorNamespace, ns)
if err != nil {
- logger.Error(err, "read authbridge-config")
+ logger.Error(err, "read authbridge-config or kagenti-operator-config")
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
if ab.KeycloakURL == "" || ab.KeycloakRealm == "" {
- logger.Info("waiting for KEYCLOAK_URL/KEYCLOAK_REALM in authbridge-config", "namespace", ns)
+ logger.Info("waiting for KEYCLOAK_URL/KEYCLOAK_REALM in kagenti-operator-config or authbridge-config",
+ "operatorNamespace", r.OperatorNamespace, "workloadNamespace", ns)
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
+ // Read keycloak-admin-secret from the operator namespace (kagenti-system).
+ // This secret is created by the installation script and should only exist in the operator namespace.
+ // The operator uses these credentials to register OIDC clients in Keycloak on behalf of agents.
+ // Agent namespaces should NOT have a copy of this secret - only the operator needs access.
adminSecret := &corev1.Secret{}
- if err := r.uncachedReader().Get(ctx, types.NamespacedName{Namespace: ns, Name: keycloakAdminSecret}, adminSecret); err != nil {
+ if err := r.uncachedReader().Get(ctx, types.NamespacedName{Namespace: r.OperatorNamespace, Name: keycloakAdminSecret}, adminSecret); err != nil {
if apierrors.IsNotFound(err) {
- logger.Info("waiting for keycloak-admin-secret", "namespace", ns)
+ logger.Info("waiting for keycloak-admin-secret in operator namespace",
+ "operatorNamespace", r.OperatorNamespace,
+ "secretName", keycloakAdminSecret)
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
return ctrl.Result{}, err
@@ -174,7 +188,8 @@ func (r *ClientRegistrationReconciler) reconcileOne(
adminUser := string(adminSecret.Data["KEYCLOAK_ADMIN_USERNAME"])
adminPass := string(adminSecret.Data["KEYCLOAK_ADMIN_PASSWORD"])
if adminUser == "" || adminPass == "" {
- logger.Info("keycloak-admin-secret missing username/password keys")
+ logger.Info("keycloak-admin-secret missing username/password keys",
+ "operatorNamespace", r.OperatorNamespace)
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
@@ -296,9 +311,37 @@ type authbridgeConfig struct {
KeycloakAudienceScopeEnabled string
}
-func readAuthbridgeConfigMap(ctx context.Context, c client.Reader, namespace string) (authbridgeConfig, error) {
+// readAuthbridgeConfigMap reads Keycloak configuration from:
+// 1. First priority: operatorNamespace/kagenti-operator-config (centralized config)
+// 2. Fallback: workloadNamespace/authbridge-config (per-namespace config for backward compatibility)
+//
+// The centralized config is preferred for waypoint mode where agent pods don't need the ConfigMap.
+// Per-namespace config is still supported for sidecar mode and backward compatibility.
+func readAuthbridgeConfigMap(ctx context.Context, c client.Reader, operatorNamespace, workloadNamespace string) (authbridgeConfig, error) {
+ logger := log.FromContext(ctx)
+
+ // Try centralized operator config first (preferred for waypoint mode)
+ if operatorNamespace != "" {
+ cm := &corev1.ConfigMap{}
+ err := c.Get(ctx, types.NamespacedName{Namespace: operatorNamespace, Name: operatorConfigConfigMap}, cm)
+ if err == nil && cm.Data != nil {
+ config := extractAuthbridgeConfig(cm.Data)
+ // Only use operator config if it has the required fields
+ if config.KeycloakURL != "" && config.KeycloakRealm != "" {
+ logger.V(1).Info("using centralized operator config",
+ "configMap", operatorNamespace+"/"+operatorConfigConfigMap)
+ return config, nil
+ }
+ }
+ // If operator config doesn't exist or is incomplete, fall back to namespace config
+ if err != nil && !apierrors.IsNotFound(err) {
+ return authbridgeConfig{}, err
+ }
+ }
+
+ // Fall back to per-namespace authbridge-config (backward compatibility)
cm := &corev1.ConfigMap{}
- err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: authbridgeConfigConfigMap}, cm)
+ err := c.Get(ctx, types.NamespacedName{Namespace: workloadNamespace, Name: authbridgeConfigConfigMap}, cm)
if apierrors.IsNotFound(err) {
return authbridgeConfig{}, nil
}
@@ -308,16 +351,22 @@ func readAuthbridgeConfigMap(ctx context.Context, c client.Reader, namespace str
if cm.Data == nil {
return authbridgeConfig{}, nil
}
+ logger.V(1).Info("using per-namespace authbridge config (fallback)",
+ "configMap", workloadNamespace+"/"+authbridgeConfigConfigMap)
+ return extractAuthbridgeConfig(cm.Data), nil
+}
+
+func extractAuthbridgeConfig(data map[string]string) authbridgeConfig {
return authbridgeConfig{
- KeycloakURL: cm.Data["KEYCLOAK_URL"],
- KeycloakRealm: cm.Data["KEYCLOAK_REALM"],
- SpireEnabled: cm.Data["SPIRE_ENABLED"],
- ClientAuthType: cm.Data["CLIENT_AUTH_TYPE"],
- SpiffeIDPAlias: cm.Data["SPIFFE_IDP_ALIAS"],
- KeycloakTokenExchangeEnabled: cm.Data["KEYCLOAK_TOKEN_EXCHANGE_ENABLED"],
- PlatformClientIDs: cm.Data["PLATFORM_CLIENT_IDS"],
- KeycloakAudienceScopeEnabled: cm.Data["KEYCLOAK_AUDIENCE_SCOPE_ENABLED"],
- }, nil
+ KeycloakURL: data["KEYCLOAK_URL"],
+ KeycloakRealm: data["KEYCLOAK_REALM"],
+ SpireEnabled: data["SPIRE_ENABLED"],
+ ClientAuthType: data["CLIENT_AUTH_TYPE"],
+ SpiffeIDPAlias: data["SPIFFE_IDP_ALIAS"],
+ KeycloakTokenExchangeEnabled: data["KEYCLOAK_TOKEN_EXCHANGE_ENABLED"],
+ PlatformClientIDs: data["PLATFORM_CLIENT_IDS"],
+ KeycloakAudienceScopeEnabled: data["KEYCLOAK_AUDIENCE_SCOPE_ENABLED"],
+ }
}
func parsePlatformClientIDs(raw string) []string {
diff --git a/kagenti-operator/internal/controller/clientregistration_controller_test.go b/kagenti-operator/internal/controller/clientregistration_controller_test.go
index 9acb183f..8d89829e 100644
--- a/kagenti-operator/internal/controller/clientregistration_controller_test.go
+++ b/kagenti-operator/internal/controller/clientregistration_controller_test.go
@@ -25,6 +25,7 @@ import (
const (
clientRegistrationTestNamespace = "test-ns"
clientRegistrationTestDeploymentName = "my-dep"
+ clientRegistrationOperatorNamespace = "kagenti-system"
)
func TestWorkloadWantsOperatorClientReg(t *testing.T) {
@@ -322,7 +323,7 @@ func TestClientRegistrationReconciler_Reconcile(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
scheme := clientRegistrationTestScheme(t)
c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tc.objs...).Build()
- r := &ClientRegistrationReconciler{Client: c, Scheme: scheme}
+ r := &ClientRegistrationReconciler{Client: c, Scheme: scheme, OperatorNamespace: clientRegistrationOperatorNamespace}
res, err := r.Reconcile(ctx, req)
if err != nil {
t.Fatalf("Reconcile: %v", err)
@@ -350,9 +351,9 @@ func TestClientRegistrationReconciler_Reconcile(t *testing.T) {
clusterFeatureGatesConfigMap(true),
dep,
authbridgeConfigMapForTest(clientRegistrationTestNamespace, srv.URL),
- keycloakAdminSecretForTest(clientRegistrationTestNamespace),
+ keycloakAdminSecretForTest(clientRegistrationOperatorNamespace),
).Build()
- r := &ClientRegistrationReconciler{Client: c, Scheme: scheme}
+ r := &ClientRegistrationReconciler{Client: c, Scheme: scheme, OperatorNamespace: clientRegistrationOperatorNamespace}
res, err := r.Reconcile(ctx, req)
if err != nil || res != (ctrl.Result{}) {
t.Fatalf("got (%v, %v), want (zero Result, nil)", res, err)
diff --git a/kagenti-operator/internal/controller/namespace_waypoint_controller.go b/kagenti-operator/internal/controller/namespace_waypoint_controller.go
new file mode 100644
index 00000000..6fd5e3b1
--- /dev/null
+++ b/kagenti-operator/internal/controller/namespace_waypoint_controller.go
@@ -0,0 +1,339 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+ "context"
+ "fmt"
+
+ corev1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/client"
+ "sigs.k8s.io/controller-runtime/pkg/handler"
+ "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+ gwapiv1 "sigs.k8s.io/gateway-api/apis/v1"
+)
+
+const (
+ // Labels for namespace Istio ambient mesh configuration.
+ // IstioDiscoveryLabel, IstioDataplaneModeLabel and IstioUseWaypointLabel are written to the
+ // Namespace by ensureIstioLabels; IstioWaypointForLabel is written to the waypoint Gateway.
+ IstioDiscoveryLabel = "istio-discovery"
+ IstioDataplaneModeLabel = "istio.io/dataplane-mode"
+ IstioUseWaypointLabel = "istio.io/use-waypoint"
+ IstioWaypointForLabel = "istio.io/waypoint-for"
+
+ // Label values paired with the keys above. IstioUseWaypointLabel has no constant value:
+ // it is set to the per-namespace waypoint name (namespace name + WaypointNameSuffix).
+ IstioDiscoveryEnabled = "enabled"
+ IstioDataplaneModeAmbient = "ambient"
+ IstioWaypointForAll = "all"
+
+ // Kagenti workload type label. A pod counts as a Kagenti workload when this label
+ // equals KagentiTypeAgent or KagentiTypeTool.
+ KagentiTypeLabel = "kagenti.io/type"
+ KagentiTypeAgent = "agent"
+ KagentiTypeTool = "tool"
+
+ // GatewayClass for Istio waypoint; used as spec.gatewayClassName of the created Gateway.
+ IstioWaypointGatewayClass = "istio-waypoint"
+
+ // Waypoint name suffix; the Gateway is named "<namespace>" + WaypointNameSuffix.
+ WaypointNameSuffix = "-waypoint"
+)
+
+// NamespaceWaypointReconciler watches namespaces and ensures waypoint configuration
+// for namespaces containing Kagenti agents or tools.
+type NamespaceWaypointReconciler struct {
+ // Client is the embedded Kubernetes client; every Get/List/Create/Update issued by this
+ // reconciler goes through it.
+ client.Client
+ // Scheme is not referenced by the methods in this file.
+ // NOTE(review): presumably kept for parity with the other reconcilers — confirm it is needed.
+ Scheme *runtime.Scheme
+ // EnableWaypointProvisioning controls whether waypoint gateways are automatically created.
+ // When false, Reconcile returns before reading or modifying any cluster object.
+ EnableWaypointProvisioning bool
+}
+
+// +kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
+// +kubebuilder:rbac:groups=apps,resources=deployments;statefulsets;daemonsets,verbs=get;list;watch
+// +kubebuilder:rbac:groups=batch,resources=jobs;cronjobs,verbs=get;list;watch
+// +kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=gateways,verbs=get;list;watch;create;update;patch;delete
+
+// Reconcile ensures waypoint configuration for a namespace that contains Kagenti workloads.
+//
+// The request identifies a Namespace by req.Name. The call is a no-op when provisioning is
+// disabled, when the namespace no longer exists or is terminating, or when it holds no
+// agent/tool pods. Otherwise the Istio ambient labels are applied to the namespace and the
+// waypoint Gateway is created or validated.
+//
+// Errors are returned (not logged here) so they are reported once by the caller; the helper
+// functions already wrap their errors with context.
+func (r *NamespaceWaypointReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	// Named logger (not log) so the imported log package is not shadowed.
+	logger := log.FromContext(ctx)
+
+	if !r.EnableWaypointProvisioning {
+		logger.V(1).Info("Waypoint provisioning disabled, skipping")
+		return ctrl.Result{}, nil
+	}
+
+	logger.Info("Reconciling namespace for waypoint configuration", "namespace", req.Name)
+
+	namespace := &corev1.Namespace{}
+	if err := r.Get(ctx, req.NamespacedName, namespace); err != nil {
+		if apierrors.IsNotFound(err) {
+			logger.V(1).Info("Namespace not found, may have been deleted")
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("get namespace %q: %w", req.Name, err)
+	}
+
+	// A terminating namespace must not receive new labels or a new Gateway.
+	if !namespace.DeletionTimestamp.IsZero() {
+		logger.V(1).Info("Namespace is being deleted, skipping waypoint configuration")
+		return ctrl.Result{}, nil
+	}
+
+	hasKagentiWorkloads, err := r.namespaceHasKagentiWorkloads(ctx, namespace.Name)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if !hasKagentiWorkloads {
+		logger.V(1).Info("Namespace has no Kagenti workloads, skipping waypoint configuration")
+		return ctrl.Result{}, nil
+	}
+
+	logger.Info("Namespace has Kagenti workloads, ensuring waypoint configuration")
+
+	// Labels first, then the Gateway they refer to; both steps are idempotent.
+	if err := r.ensureIstioLabels(ctx, namespace); err != nil {
+		return ctrl.Result{}, err
+	}
+	if err := r.ensureWaypointGateway(ctx, namespace); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	logger.Info("Successfully configured waypoint for namespace")
+	return ctrl.Result{}, nil
+}
+
+// namespaceHasKagentiWorkloads reports whether the namespace contains at least one pod whose
+// kagenti.io/type label is "agent" or "tool".
+//
+// The List call is restricted to pods that carry the kagenti.io/type label, so unrelated pods
+// are filtered out by the client instead of being inspected here. A List failure is returned
+// wrapped, together with false.
+func (r *NamespaceWaypointReconciler) namespaceHasKagentiWorkloads(ctx context.Context, namespace string) (bool, error) {
+	logger := log.FromContext(ctx)
+
+	podList := &corev1.PodList{}
+	listOpts := []client.ListOption{
+		client.InNamespace(namespace),
+		client.HasLabels{KagentiTypeLabel},
+	}
+	if err := r.List(ctx, podList, listOpts...); err != nil {
+		return false, fmt.Errorf("failed to list pods in namespace %s: %w", namespace, err)
+	}
+
+	logger.V(1).Info("Checking for Kagenti workloads in namespace",
+		"namespace", namespace,
+		"labeledPods", len(podList.Items))
+
+	// Index into Items rather than ranging by value: corev1.Pod is a large struct and a
+	// by-value range would copy every element.
+	for i := range podList.Items {
+		pod := &podList.Items[i]
+		switch kagentiType := pod.Labels[KagentiTypeLabel]; kagentiType {
+		case KagentiTypeAgent, KagentiTypeTool:
+			logger.Info("Found Kagenti workload pod",
+				"namespace", namespace,
+				"pod", pod.Name,
+				"kagenti.io/type", kagentiType)
+			return true, nil
+		}
+	}
+
+	logger.V(1).Info("No Kagenti workloads found in namespace", "namespace", namespace)
+	return false, nil
+}
+
+// ensureIstioLabels ensures the namespace has the required Istio ambient mesh labels:
+// istio-discovery=enabled, istio.io/dataplane-mode=ambient and
+// istio.io/use-waypoint=<namespace>-waypoint.
+//
+// When at least one label is missing or different, the change is sent as a merge patch
+// computed against a snapshot of the object taken before mutation, so only the label delta is
+// written and a concurrent change to the Namespace's resourceVersion does not fail the call.
+// Nothing is sent when the labels are already correct.
+func (r *NamespaceWaypointReconciler) ensureIstioLabels(ctx context.Context, namespace *corev1.Namespace) error {
+	logger := log.FromContext(ctx)
+
+	required := map[string]string{
+		IstioDiscoveryLabel:     IstioDiscoveryEnabled,
+		IstioDataplaneModeLabel: IstioDataplaneModeAmbient,
+		IstioUseWaypointLabel:   namespace.Name + WaypointNameSuffix,
+	}
+
+	// Snapshot before any mutation; DeepCopy also copies the label map, so edits below do
+	// not leak into the patch base.
+	base := namespace.DeepCopy()
+
+	labels := namespace.GetLabels()
+	if labels == nil {
+		labels = make(map[string]string, len(required))
+	}
+
+	changed := false
+	for key, value := range required {
+		if labels[key] == value {
+			continue
+		}
+		logger.Info("Adding/updating Istio label",
+			"namespace", namespace.Name,
+			"label", key,
+			"value", value)
+		labels[key] = value
+		changed = true
+	}
+
+	if !changed {
+		logger.V(1).Info("Namespace already has correct Istio labels", "namespace", namespace.Name)
+		return nil
+	}
+
+	namespace.SetLabels(labels)
+	if err := r.Patch(ctx, namespace, client.MergeFrom(base)); err != nil {
+		return fmt.Errorf("failed to update namespace labels: %w", err)
+	}
+	logger.Info("Updated namespace Istio labels", "namespace", namespace.Name)
+	return nil
+}
+
+// ensureWaypointGateway ensures a waypoint gateway exists in the namespace.
+//
+// The Gateway is named "<namespace>-waypoint". If it already exists, only its labels are
+// validated. Otherwise a Gateway of class "istio-waypoint" with a single HBONE listener named
+// "mesh" on port 15008 and the label istio.io/waypoint-for=all is created. An AlreadyExists
+// response from Create is treated as success rather than as a reconcile failure.
+func (r *NamespaceWaypointReconciler) ensureWaypointGateway(ctx context.Context, namespace *corev1.Namespace) error {
+	logger := log.FromContext(ctx)
+
+	gatewayName := namespace.Name + WaypointNameSuffix
+
+	gateway := &gwapiv1.Gateway{}
+	err := r.Get(ctx, client.ObjectKey{
+		Name:      gatewayName,
+		Namespace: namespace.Name,
+	}, gateway)
+	if err == nil {
+		logger.V(1).Info("Waypoint gateway already exists", "namespace", namespace.Name, "gateway", gatewayName)
+		return r.validateWaypointLabels(ctx, gateway)
+	}
+	if !apierrors.IsNotFound(err) {
+		return fmt.Errorf("failed to get waypoint gateway: %w", err)
+	}
+
+	logger.Info("Creating waypoint gateway", "namespace", namespace.Name, "gateway", gatewayName)
+
+	gateway = &gwapiv1.Gateway{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      gatewayName,
+			Namespace: namespace.Name,
+			Labels: map[string]string{
+				IstioWaypointForLabel: IstioWaypointForAll,
+			},
+		},
+		Spec: gwapiv1.GatewaySpec{
+			GatewayClassName: gwapiv1.ObjectName(IstioWaypointGatewayClass),
+			Listeners: []gwapiv1.Listener{
+				{
+					Name:     "mesh",
+					Port:     gwapiv1.PortNumber(15008),
+					Protocol: gwapiv1.ProtocolType("HBONE"),
+				},
+			},
+		},
+	}
+
+	if err := r.Create(ctx, gateway); err != nil {
+		if apierrors.IsAlreadyExists(err) {
+			// The Gateway appeared between the Get above and this Create (for example a
+			// concurrent creator, or a stale read if the client is cache-backed). The
+			// desired object exists, so this is not an error; its labels are validated
+			// the next time this namespace is reconciled.
+			logger.V(1).Info("Waypoint gateway was created concurrently",
+				"namespace", namespace.Name, "gateway", gatewayName)
+			return nil
+		}
+		return fmt.Errorf("failed to create waypoint gateway: %w", err)
+	}
+
+	logger.Info("Successfully created waypoint gateway", "namespace", namespace.Name, "gateway", gatewayName)
+	return nil
+}
+
+// validateWaypointLabels makes sure an existing waypoint gateway carries the label
+// istio.io/waypoint-for=all. When the label is absent or has another value it is set and the
+// gateway is written back with Update; otherwise nothing is sent.
+func (r *NamespaceWaypointReconciler) validateWaypointLabels(ctx context.Context, gateway *gwapiv1.Gateway) error {
+	// Reading a missing key from a nil map yields "", so this also covers unlabeled gateways.
+	if gateway.GetLabels()[IstioWaypointForLabel] == IstioWaypointForAll {
+		return nil
+	}
+
+	logger := log.FromContext(ctx)
+	logger.Info("Updating waypoint gateway label",
+		"namespace", gateway.Namespace,
+		"gateway", gateway.Name,
+		"label", IstioWaypointForLabel,
+		"value", IstioWaypointForAll)
+
+	desired := gateway.GetLabels()
+	if desired == nil {
+		desired = make(map[string]string)
+	}
+	desired[IstioWaypointForLabel] = IstioWaypointForAll
+	gateway.SetLabels(desired)
+
+	if err := r.Update(ctx, gateway); err != nil {
+		return fmt.Errorf("failed to update waypoint gateway labels: %w", err)
+	}
+	return nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+//
+// Namespace is the primary reconciled resource. Pod events are additionally watched and mapped
+// to a request for the Pod's namespace by podToNamespaceRequest. Any builder error is returned
+// unmodified for the caller to report.
+func (r *NamespaceWaypointReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&corev1.Namespace{}).
+		Watches(
+			&corev1.Pod{},
+			handler.EnqueueRequestsFromMapFunc(r.podToNamespaceRequest),
+		).
+		Complete(r)
+}
+
+// podToNamespaceRequest maps a Pod event to a reconcile request for the Pod's namespace.
+// A request is produced only for pods labelled kagenti.io/type=agent or kagenti.io/type=tool;
+// non-Pod objects and all other pods yield nil.
+func (r *NamespaceWaypointReconciler) podToNamespaceRequest(ctx context.Context, obj client.Object) []reconcile.Request {
+	logger := log.FromContext(ctx)
+
+	pod, isPod := obj.(*corev1.Pod)
+	if !isPod {
+		return nil
+	}
+
+	// A missing label reads as "", which matches neither workload type.
+	kagentiType := pod.Labels[KagentiTypeLabel]
+	if kagentiType != KagentiTypeAgent && kagentiType != KagentiTypeTool {
+		logger.V(2).Info("Pod does not have kagenti.io/type label, skipping", "pod", pod.Name)
+		return nil
+	}
+
+	// Only Name is set on the key: the request targets the Namespace object itself.
+	requests := []reconcile.Request{
+		{NamespacedName: client.ObjectKey{Name: pod.Namespace}},
+	}
+	logger.Info("Pod event triggered namespace waypoint reconciliation",
+		"pod", pod.Name,
+		"namespace", pod.Namespace,
+		"kagenti.io/type", kagentiType,
+		"reconcileRequest", requests)
+	return requests
+}