Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions kagenti-operator/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import (
"github.com/kagenti/operator/internal/signature"
"github.com/kagenti/operator/internal/tekton"
webhookv1alpha1 "github.com/kagenti/operator/internal/webhook/v1alpha1"
gwapiv1 "sigs.k8s.io/gateway-api/apis/v1"
// +kubebuilder:scaffold:imports
)

Expand All @@ -60,6 +61,7 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(agentv1alpha1.AddToScheme(scheme))
utilruntime.Must(tekton.AddToScheme(scheme))
utilruntime.Must(gwapiv1.Install(scheme))
// +kubebuilder:scaffold:scheme
}

Expand All @@ -78,14 +80,18 @@ func main() {
var signatureAuditMode bool
var enforceNetworkPolicies bool
var enableOperatorClientRegistration bool
var enableWaypointProvisioning bool

var operatorNamespace string
var spireTrustDomain string
var spireTrustBundleConfigMapName string
var spireTrustBundleConfigMapNS string
var spireTrustBundleConfigMapKey string
var spireTrustBundleRefreshInterval time.Duration
var svidExpiryGracePeriod time.Duration

flag.StringVar(&operatorNamespace, "operator-namespace", os.Getenv("POD_NAMESPACE"),
"Namespace where the operator is running (default: POD_NAMESPACE env var, fallback: 'kagenti-system')")
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
Expand All @@ -112,6 +118,8 @@ func main() {
flag.BoolVar(&enableOperatorClientRegistration, "enable-operator-client-registration", false,
"Reconcile Keycloak client registration for agent/tool workloads unless "+
"kagenti.io/client-registration-inject=true (legacy sidecar)")
flag.BoolVar(&enableWaypointProvisioning, "enable-waypoint-provisioning", true,
"Automatically provision Istio waypoint gateways for namespaces with Kagenti workloads")

flag.StringVar(&spireTrustDomain, "spire-trust-domain", "",
"SPIRE trust domain for identity binding (e.g. 'example.org')")
Expand All @@ -134,6 +142,12 @@ func main() {

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

// Default operator namespace if not set
if operatorNamespace == "" {
operatorNamespace = "kagenti-system"
setupLog.Info("operator-namespace not set, using default", "namespace", operatorNamespace)
}

// Mitigate CVE-2023-44487 (HTTP/2 Rapid Reset).
disableHTTP2 := func(c *tls.Config) {
c.NextProtos = []string{"http/1.1"}
Expand Down Expand Up @@ -201,7 +215,10 @@ func main() {
Scheme: scheme,
Metrics: metricsServerOptions,
Cache: cache.Options{
DefaultNamespaces: getNamespacesToWatch(),
// Note: DefaultNamespaces is intentionally not set (removed getNamespacesToWatch()).
// When not set, the cache defaults to cluster-wide for all resources except those
// explicitly scoped in ByObject below.
//
// Scope the ConfigMap informer to only kagenti-relevant ConfigMaps.
// Without this, the controller would cache ALL ConfigMaps cluster-wide.
//
Expand Down Expand Up @@ -230,6 +247,10 @@ func main() {
},
},
},
// NOTE: All other resources (Namespace, Pod, Deployment, StatefulSet, Gateway)
// are intentionally NOT in ByObject. With DefaultNamespaces not set, they will
// automatically use the default cluster-wide cache, which is what we want.
// Explicitly adding them to ByObject was preventing controllers from starting.
},
},
WebhookServer: webhookServer,
Expand Down Expand Up @@ -335,13 +356,15 @@ func main() {
Client: mgr.GetClient(),
APIReader: mgr.GetAPIReader(),
Scheme: mgr.GetScheme(),
OperatorNamespace: operatorNamespace,
SpireTrustDomain: spireTrustDomain,
KeycloakAdminTokenCache: &keycloak.CachedAdminTokenProvider{},
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "ClientRegistration")
os.Exit(1)
}
setupLog.Info("Operator-managed client registration controller enabled")
setupLog.Info("Operator-managed client registration controller enabled",
"operatorNamespace", operatorNamespace)
}

if controller.TektonConfigCRDExists(mgr.GetConfig()) {
Expand All @@ -353,6 +376,18 @@ func main() {
}
}

if enableWaypointProvisioning {
if err = (&controller.NamespaceWaypointReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
EnableWaypointProvisioning: enableWaypointProvisioning,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NamespaceWaypoint")
os.Exit(1)
}
setupLog.Info("Waypoint provisioning controller enabled")
}

if err = webhookv1alpha1.SetupAgentCardWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "AgentCard")
os.Exit(1)
Expand Down
49 changes: 49 additions & 0 deletions kagenti-operator/docs/diagrams/agent-communication-flow.mmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
%% Agent-to-Agent Communication Flow with Token Exchange
sequenceDiagram
participant A1 as Agent Pod<br/>(team1/agent-a)
participant KC as Keycloak
participant Z1 as ztunnel<br/>(team1 node)
participant W1 as Waypoint Gateway<br/>(team1-waypoint)
participant W2 as Waypoint Gateway<br/>(team2-waypoint)
participant Z2 as ztunnel<br/>(team2 node)
participant A2 as Agent Pod<br/>(team2/agent-b)

Note over A1: Step 1: Obtain Access Token
A1->>A1: Read credentials from<br/>/shared/client-id.txt<br/>/shared/client-secret.txt
A1->>KC: POST /token<br/>grant_type=client_credentials<br/>client_id=team1/agent-a<br/>client_secret=#lt;secret#gt;
KC-->>A1: JWT Access Token<br/>aud: [team2/agent-b, ...]<br/>azp: team1/agent-a<br/>exp: 300s

Note over A1,KC: Step 2: (Optional) Token Exchange
A1->>KC: POST /token<br/>grant_type=urn:ietf:params:oauth:grant-type:token-exchange<br/>subject_token=#lt;access_token#gt;<br/>audience=team2/agent-b
KC-->>A1: Exchanged JWT Token<br/>aud: team2/agent-b<br/>azp: team1/agent-a

Note over A1,A2: Step 3: Cross-Namespace HTTP Request
A1->>Z1: HTTP GET /api/task<br/>Host: agent-b.team2.svc.cluster.local<br/>Authorization: Bearer #lt;jwt#gt;

Note over Z1: L4 mTLS Tunnel
Z1->>W1: mTLS encrypted traffic<br/>Source: team1/agent-a

Note over W1: L7 Processing
W1->>W1: 1. Extract JWT from Authorization header
W1->>W1: 2. Validate JWT signature (Keycloak JWKS)
W1->>W1: 3. Check exp, iss, aud claims
W1->>W1: 4. Verify audience includes team2/agent-b

Note over W1,W2: Cross-Namespace L4 mTLS
W1->>W2: mTLS encrypted traffic<br/>Validated request

Note over W2: L7 Validation (Defense in Depth)
W2->>W2: 1. Re-validate JWT signature
W2->>W2: 2. Check audience matches team2/agent-b
W2->>W2: 3. Apply AuthorizationPolicy (if configured)

W2->>Z2: Forward validated request
Z2->>A2: HTTP GET /api/task<br/>Headers: x-forwarded-client-cert, etc.

A2-->>Z2: HTTP 200 OK<br/>Response data
Z2-->>W2: Response
W2-->>W1: Response
W1-->>Z1: Response
Z1-->>A1: HTTP 200 OK<br/>Response data

Note over A1,A2: ✅ Authenticated & Authorized Communication Complete
71 changes: 71 additions & 0 deletions kagenti-operator/docs/diagrams/operator-reconciliation.mmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
%% Operator Reconciliation Flows
graph TB
subgraph "NamespaceWaypointReconciler"
NWR_WATCH[Watch Namespaces<br/>& Pods]
NWR_CHECK{Has kagenti.io/type=agent<br/>workload pods?}
NWR_GATEWAY{Gateway exists?}
NWR_CREATE[Create Gateway Resource]
NWR_LABELS{Istio labels applied?}
NWR_APPLY["Apply Istio Labels:<br/>- istio-discovery: enabled<br/>- istio.io/dataplane-mode: ambient<br/>- istio.io/use-waypoint: #lt;ns#gt;-waypoint"]
NWR_DONE[Reconcile Complete]

NWR_WATCH --> NWR_CHECK
NWR_CHECK -->|Yes| NWR_GATEWAY
NWR_CHECK -->|No| NWR_DONE
NWR_GATEWAY -->|No| NWR_CREATE
NWR_GATEWAY -->|Yes| NWR_LABELS
NWR_CREATE --> NWR_LABELS
NWR_LABELS -->|No| NWR_APPLY
NWR_LABELS -->|Yes| NWR_DONE
NWR_APPLY --> NWR_DONE
end

subgraph "ClientRegistrationReconciler"
CRR_WATCH[Watch Deployments<br/>& StatefulSets]
CRR_FILTER{kagenti.io/type=agent<br/>AND NOT client-registration-inject=true?}
CRR_GATES{Feature gates enabled?}
CRR_CONFIG[Read kagenti-operator-config<br/>or authbridge-config]
CRR_ADMIN[Read keycloak-admin-secret<br/>from operator namespace]
CRR_CLIENTID[Compute Client ID:<br/>namespace/workload OR<br/>spiffe://trust-domain/ns/.../sa/...]
CRR_REGISTER[Register/Fetch OIDC Client<br/>in Keycloak]
CRR_AUDIENCE[Ensure Audience Scopes<br/>for platform clients]
CRR_SECRET{Secret exists?}
CRR_CREATE_SEC["Create Secret:<br/>kagenti-keycloak-client-credentials-#lt;hash#gt;"]
CRR_UPDATE_SEC[Update Secret if changed]
CRR_ANNOTATE[Annotate Pod Template:<br/>kagenti.io/keycloak-client-credentials-secret-name]
CRR_DONE[Reconcile Complete]

CRR_WATCH --> CRR_FILTER
CRR_FILTER -->|No| CRR_DONE
CRR_FILTER -->|Yes| CRR_GATES
CRR_GATES -->|Disabled| CRR_DONE
CRR_GATES -->|Enabled| CRR_CONFIG
CRR_CONFIG --> CRR_ADMIN
CRR_ADMIN --> CRR_CLIENTID
CRR_CLIENTID --> CRR_REGISTER
CRR_REGISTER --> CRR_AUDIENCE
CRR_AUDIENCE --> CRR_SECRET
CRR_SECRET -->|No| CRR_CREATE_SEC
CRR_SECRET -->|Yes| CRR_UPDATE_SEC
CRR_CREATE_SEC --> CRR_ANNOTATE
CRR_UPDATE_SEC --> CRR_ANNOTATE
CRR_ANNOTATE --> CRR_DONE
end

subgraph "External Systems"
KEYCLOAK[Keycloak<br/>OIDC Provider]
K8S_API[Kubernetes API<br/>Gateway, Secret, ConfigMap]
end

NWR_CREATE -.creates.-> K8S_API
NWR_APPLY -.patches.-> K8S_API
CRR_REGISTER -.HTTP API.-> KEYCLOAK
CRR_AUDIENCE -.HTTP API.-> KEYCLOAK
CRR_CREATE_SEC -.creates.-> K8S_API
CRR_ANNOTATE -.patches.-> K8S_API

style NWR_DONE fill:#bfb,stroke:#333,stroke-width:2px
style CRR_DONE fill:#bfb,stroke:#333,stroke-width:2px
style CRR_ADMIN fill:#f9f,stroke:#333,stroke-width:2px
style KEYCLOAK fill:#ffd,stroke:#333,stroke-width:2px
style K8S_API fill:#bbf,stroke:#333,stroke-width:2px
68 changes: 68 additions & 0 deletions kagenti-operator/docs/diagrams/security-architecture.mmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
%% Security Architecture - Centralized Secrets
graph TB
subgraph "kagenti-system Namespace (Operator)"
OP[Kagenti Operator Pod]
ADMIN_SEC[🔒 keycloak-admin-secret<br/>KEYCLOAK_ADMIN_USERNAME<br/>KEYCLOAK_ADMIN_PASSWORD]
OP_CFG[kagenti-operator-config<br/>KEYCLOAK_URL<br/>KEYCLOAK_REALM<br/>...]

OP -->|reads| ADMIN_SEC
OP -->|reads| OP_CFG
end

subgraph "Keycloak (External)"
KC_ADMIN[Admin REST API<br/>/admin/realms/kagenti/clients]
KC_TOKEN[Token Endpoint<br/>/realms/kagenti/protocol/openid-connect/token]
end

subgraph "team1 Namespace (Agents)"
AGENT1[Agent Pod<br/>team1/agent-a]
CLIENT_SEC1["🔑 Client Credentials Secret<br/>client-id: team1/agent-a<br/>client-secret: #lt;random-32-chars#gt;"]

AGENT1 -->|mounts read-only| CLIENT_SEC1
AGENT1 -.❌ NO ACCESS.-> ADMIN_SEC
end

subgraph "team2 Namespace (Agents)"
AGENT2[Agent Pod<br/>team2/agent-b]
CLIENT_SEC2["🔑 Client Credentials Secret<br/>client-id: team2/agent-b<br/>client-secret: #lt;random-32-chars#gt;"]

AGENT2 -->|mounts read-only| CLIENT_SEC2
AGENT2 -.❌ NO ACCESS.-> ADMIN_SEC
end

subgraph "team3 Namespace (Agents)"
AGENT3[Agent Pod<br/>team3/agent-c]
CLIENT_SEC3["🔑 Client Credentials Secret<br/>client-id: team3/agent-c<br/>client-secret: #lt;random-32-chars#gt;"]

AGENT3 -->|mounts read-only| CLIENT_SEC3
AGENT3 -.❌ NO ACCESS.-> ADMIN_SEC
end

%% Operator uses admin credentials to manage clients
OP -->|authenticates with<br/>admin credentials| KC_ADMIN
OP -->|registers OIDC clients| KC_ADMIN
OP -->|creates secrets| CLIENT_SEC1
OP -->|creates secrets| CLIENT_SEC2
OP -->|creates secrets| CLIENT_SEC3

%% Agents use client credentials for tokens
AGENT1 -->|client_credentials grant| KC_TOKEN
AGENT2 -->|client_credentials grant| KC_TOKEN
AGENT3 -->|client_credentials grant| KC_TOKEN

%% Security boundaries
classDef adminSecret fill:#f9f,stroke:#900,stroke-width:3px
classDef clientSecret fill:#bfb,stroke:#090,stroke-width:2px
classDef noAccess stroke:#f00,stroke-dasharray: 5 5,stroke-width:2px

class ADMIN_SEC adminSecret
class CLIENT_SEC1,CLIENT_SEC2,CLIENT_SEC3 clientSecret

%% Annotations
note1[🔒 Admin Secret<br/>- ONLY in operator namespace<br/>- NEVER in agent namespaces<br/>- Used ONLY by operator]
note2[🔑 Client Secrets<br/>- One per agent workload<br/>- Owner reference to workload<br/>- Auto-deleted with workload]
note3[✅ Security Benefits<br/>- Reduced attack surface<br/>- Principle of least privilege<br/>- Centralized rotation<br/>- Simplified auditing]

style note1 fill:#fff,stroke:#900,stroke-width:2px
style note2 fill:#fff,stroke:#090,stroke-width:2px
style note3 fill:#fff,stroke:#009,stroke-width:2px
86 changes: 86 additions & 0 deletions kagenti-operator/docs/diagrams/waypoint-architecture.mmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
%% High-Level Waypoint Mode Architecture
graph TB
subgraph "Operator Namespace (kagenti-system)"
OP[kagenti-controller-manager]
NWRC[NamespaceWaypointReconciler]
CRC[ClientRegistrationReconciler]
OPSEC[keycloak-admin-secret<br/>ADMIN CREDENTIALS]
OPCFG[kagenti-operator-config<br/>KEYCLOAK_URL, REALM, etc.]

OP --> NWRC
OP --> CRC
CRC --> OPSEC
CRC --> OPCFG
NWRC --> OPCFG
end

subgraph "Keycloak (External)"
KC[Keycloak Server]
REALM[Realm: kagenti]
KC --> REALM
end

subgraph "Agent Namespace: team1"
NS1[Namespace<br/>labels: kagenti.io/type=agent]
GW1[Istio Gateway<br/>team1-waypoint]
GWP1[Waypoint Pod<br/>L7 Envoy Proxy]

AGENT1[Agent Pod<br/>1 container]
SEC1[Client Credentials Secret<br/>client-id.txt<br/>client-secret.txt]

NS1 -.istio labels.-> GW1
GW1 --> GWP1
AGENT1 -.mounts.-> SEC1
end

subgraph "Agent Namespace: team2"
NS2[Namespace<br/>labels: kagenti.io/type=agent]
GW2[Istio Gateway<br/>team2-waypoint]
GWP2[Waypoint Pod<br/>L7 Envoy Proxy]

AGENT2[Agent Pod<br/>1 container]
SEC2[Client Credentials Secret<br/>client-id.txt<br/>client-secret.txt]

NS2 -.istio labels.-> GW2
GW2 --> GWP2
AGENT2 -.mounts.-> SEC2
end

subgraph "Istio Control Plane"
ISTIOD[istiod<br/>Control Plane]
ZTUNNEL[ztunnel DaemonSet<br/>L4 mTLS]
end

%% Operator provisions infrastructure
NWRC -->|creates| GW1
NWRC -->|creates| GW2
NWRC -->|applies labels| NS1
NWRC -->|applies labels| NS2

%% Operator manages client registration
CRC -->|registers OIDC client| KC
CRC -->|creates secret| SEC1
CRC -->|creates secret| SEC2

%% Istio manages waypoint pods
ISTIOD -.manages.-> GWP1
ISTIOD -.manages.-> GWP2
ISTIOD -.manages.-> ZTUNNEL

%% Data plane traffic
AGENT1 -->|L4 mTLS| ZTUNNEL
ZTUNNEL -->|routes to| GWP1
GWP1 -->|L7 proxy| GWP2
GWP2 -->|routes to| AGENT2

%% Authentication flow
AGENT1 -.obtains token.-> KC
AGENT2 -.obtains token.-> KC

style OPSEC fill:#f9f,stroke:#333,stroke-width:2px
style OPCFG fill:#bbf,stroke:#333,stroke-width:2px
style SEC1 fill:#bfb,stroke:#333,stroke-width:2px
style SEC2 fill:#bfb,stroke:#333,stroke-width:2px
style GWP1 fill:#ffd,stroke:#333,stroke-width:2px
style GWP2 fill:#ffd,stroke:#333,stroke-width:2px
style ZTUNNEL fill:#ddf,stroke:#333,stroke-width:2px
Loading
Loading